jdoerfert created this revision.
jdoerfert added reviewers: jhuber6, fghanim, JonChesterfield, grokos, 
AndreyChurbanov, ye-luo, tianshilei1992, ggeorgakoudis.
Herald added subscribers: llvm-commits, cfe-commits, sstefan1, guansong, bollu, 
hiraditya, yaxunl.
Herald added projects: clang, LLVM.

Since D83271 <https://reviews.llvm.org/D83271> we can optimize the GPU state 
machine to avoid spurious
call edges that increase the register usage of kernels. With this patch
we inform the user why and if this optimization is happening and when it
is not.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D83707

Files:
  clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
  llvm/lib/Transforms/IPO/OpenMPOpt.cpp

Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp
===================================================================
--- llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -1033,6 +1033,7 @@
     // Check if the function is uses in a __kmpc_kernel_prepare_parallel call at
     // all.
     bool UnknownUse = false;
+    bool KernelPrepareUse = false;
     unsigned NumDirectCalls = 0;
 
     SmallVector<Use *, 2> ToBeReplacedStateMachineUses;
@@ -1049,31 +1050,81 @@
       }
       if (OpenMPOpt::getCallIfRegularCall(*U.getUser(),
                                           &KernelPrepareParallelRFI)) {
+        KernelPrepareUse = true;
         ToBeReplacedStateMachineUses.push_back(&U);
         return;
       }
       UnknownUse = true;
     });
 
-    // If this ever hits, we should investigate.
-    if (UnknownUse || NumDirectCalls != 1)
+    // Do not emit a remark if we haven't seen a __kmpc_kernel_prepare_parallel
+    // use.
+    if (!KernelPrepareUse)
       continue;
 
-    // TODO: This is not a necessary restriction and should be lifted.
-    if (ToBeReplacedStateMachineUses.size() != 2)
+    {
+      auto Remark = [&](OptimizationRemark OR) {
+        return OR
+               << "Found parallel region that is called through a state machine"
+               << ore::NV("OpenMPParallelRegion", F->getName())
+               << " in non-SPMD target region. This can lead to excessive "
+                  "register usage in unrelated kernels in the same translation "
+                  "unit due to spurious call edges assumed by ptxas.";
+      };
+      emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
+    }
+
+    // If this ever hits, we should investigate.
+    // TODO: Checking the number of uses is not a necessary restriction and
+    // should be lifted.
+    if (UnknownUse || NumDirectCalls != 1 ||
+        ToBeReplacedStateMachineUses.size() != 2) {
+      {
+        auto Remark = [&](OptimizationRemark OR) {
+          return OR << "Parallel region "
+                    << ore::NV("OpenMPParallelRegion", F->getName()) << " has "
+                    << (UnknownUse ? "unknown" : "unexpected")
+                    << " uses; will not attempt to rewrite the state machine.";
+        };
+        emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
+      }
       continue;
+    }
 
     // Even if we have __kmpc_kernel_prepare_parallel calls, we (for now) give
     // up if the function is not called from a unique kernel.
     Kernel K = getUniqueKernelFor(*F);
-    if (!K)
+    if (!K) {
+      {
+        auto Remark = [&](OptimizationRemark OR) {
+          return OR << "Parallel region "
+                    << ore::NV("OpenMPParallelRegion", F->getName())
+                    << " is not known to be called from a single target region "
+                       "only, maybe the surrounding function has external "
+                       "linkage?; "
+                       "will not attempt to rewrite the state machine use.";
+        };
+        emitRemarkOnFunction(F, "OpenMPParallelRegionInMultipleKernesl",
+                             Remark);
+      }
       continue;
+    }
 
     // We now know F is a parallel body function called only from the kernel K.
     // We also identified the state machine uses in which we replace the
     // function pointer by a new global symbol for identification purposes. This
     // ensures only direct calls to the function are left.
 
+    {
+      auto Remark = [&](OptimizationRemark OR) {
+        return OR << "Replace state machine uses of parallel region "
+                  << ore::NV("OpenMPParallelRegion", F->getName())
+                  << " called from kernel "
+                  << ore::NV("OpenMPParallelRegionKernel", K->getName()) << ".";
+      };
+      emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
+    }
+
     Module &M = *F->getParent();
     Type *Int8Ty = Type::getInt8Ty(M.getContext());
 
Index: clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
===================================================================
--- /dev/null
+++ clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1                                 -verify=host -Rpass=openmp -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1                                 -verify      -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify      -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+
+// host-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+void bar(void) {
+    #pragma omp parallel // #1
+    // expected-remark@#1 {{Found parallel region that is called through a state machine__omp_outlined__2_wrapper in non-SPMD target region. This can lead to excessive register usage in unrelated kernels in the same translation unit due to spurious call edges assumed by ptxas.}}
+    // expected-remark@#1 {{Parallel region __omp_outlined__2_wrapper is not known to be called from a single target region only, maybe the surrounding function has external linkage?; will not attempt to rewrite the state machine use.}}
+    {
+    }
+}
+
+void foo(void) {
+  #pragma omp target teams
+  {
+    #pragma omp parallel // #2
+    // expected-remark@#2 {{Found parallel region that is called through a state machine__omp_outlined__1_wrapper in non-SPMD target region. This can lead to excessive register usage in unrelated kernels in the same translation unit due to spurious call edges assumed by ptxas.}}
+    // expected-remark@#2 {{Replace state machine uses of parallel region __omp_outlined__1_wrapper called from kernel __omp_offloading_35_}}
+    {
+    }
+    bar();
+    #pragma omp parallel // #3
+    // expected-remark@#3 {{Found parallel region that is called through a state machine__omp_outlined__3_wrapper in non-SPMD target region. This can lead to excessive register usage in unrelated kernels in the same translation unit due to spurious call edges assumed by ptxas.}}
+    // expected-remark@#3 {{Replace state machine uses of parallel region __omp_outlined__3_wrapper called from kernel __omp_offloading_35_}}
+    {
+    }
+  }
+}
+
+#endif
+
+// expected-remark@* {{OpenMP runtime call __kmpc_global_thread_num moved to}}
+// expected-remark@* {{OpenMP runtime call __kmpc_global_thread_num deduplicated}}
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to