[PATCH] D84260: [OpenMP][AMDGCN] Support OpenMP offloading for AMDGCN architecture - Part 3

Saiyedul Islam via Phabricator via cfe-commits Tue, 21 Jul 2020 12:40:14 -0700

saiislam updated this revision to Diff 279610.
saiislam added a comment.

Added final to specialized classes.



Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D84260/new/

https://reviews.llvm.org/D84260

Files:
  clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
  clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
  clang/lib/CodeGen/CMakeLists.txt
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/test/OpenMP/amdgcn_target_codegen.cpp

Index: clang/test/OpenMP/amdgcn_target_codegen.cpp
===================================================================
--- /dev/null
+++ clang/test/OpenMP/amdgcn_target_codegen.cpp
@@ -0,0 +1,45 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+#define N 1000
+
+int test_amdgcn_target_tid_threads() {
+  // CHECK-LABEL: test_amdgcn_target_tid_threads
+  // CHECK-LABEL: entry:
+
+  int arr[N];
+
+// CHECK: %nvptx_tid = call i32 @llvm.amdgcn.workitem.id.x()
+// CHECK-DAG: %nvptx_num_threads = call i64 @__ockl_get_local_size(i32 0)
+// CHECK-DAG: [[VAR1:%[0-9]+]] = trunc i64 %nvptx_num_threads to i32
+// CHECK-DAG: %thread_limit = sub nuw i32 [[VAR1]], 64
+#pragma omp target
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
+
+int test_amdgcn_target_tid_threads_simd() {
+  // CHECK-LABEL: test_amdgcn_target_tid_threads_simd
+  // CHECK-LABEL: entry:
+
+  int arr[N];
+
+// CHECK: %nvptx_num_threads = call i64 @__ockl_get_local_size(i32 0)
+// CHECK-DAG: [[VAR2:%[0-9]+]] = trunc i64 %nvptx_num_threads to i32
+// CHECK-DAG: call void @__kmpc_spmd_kernel_init(i32 [[VAR2]], i16 0, i16 0)
+#pragma omp target simd
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+  return arr[0];
+}
+
+#endif
Index: clang/lib/CodeGen/CodeGenModule.cpp
===================================================================
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -19,6 +19,7 @@
 #include "CGObjCRuntime.h"
 #include "CGOpenCLRuntime.h"
 #include "CGOpenMPRuntime.h"
+#include "CGOpenMPRuntimeAMDGCN.h"
 #include "CGOpenMPRuntimeNVPTX.h"
 #include "CodeGenFunction.h"
 #include "CodeGenPGO.h"
@@ -215,6 +216,11 @@
            "OpenMP NVPTX is only prepared to deal with device code.");
     OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this));
     break;
+  case llvm::Triple::amdgcn:
+    assert(getLangOpts().OpenMPIsDevice &&
+           "OpenMP AMDGCN is only prepared to deal with device code.");
+    OpenMPRuntime.reset(new CGOpenMPRuntimeAMDGCN(*this));
+    break;
   default:
     if (LangOpts.OpenMPSimd)
       OpenMPRuntime.reset(new CGOpenMPSIMDRuntime(*this));
Index: clang/lib/CodeGen/CMakeLists.txt
===================================================================
--- clang/lib/CodeGen/CMakeLists.txt
+++ clang/lib/CodeGen/CMakeLists.txt
@@ -62,6 +62,7 @@
   CGObjCRuntime.cpp
   CGOpenCLRuntime.cpp
   CGOpenMPRuntime.cpp
+  CGOpenMPRuntimeAMDGCN.cpp
   CGOpenMPRuntimeGPU.cpp
   CGOpenMPRuntimeNVPTX.cpp
   CGRecordLayoutBuilder.cpp
Index: clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -39,3 +39,21 @@
           &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
       "nvptx_warp_size");
 }
+
+/// Get the id of the current thread on the GPU.
+llvm::Value *CGOpenMPRuntimeNVPTX::getGPUThreadID(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  llvm::Function *F;
+  F = llvm::Intrinsic::getDeclaration(
+      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x);
+  return Bld.CreateCall(F, llvm::None, "nvptx_tid");
+}
+
+/// Get the maximum number of threads in a block of the GPU.
+llvm::Value *CGOpenMPRuntimeNVPTX::getGPUNumThreads(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  llvm::Function *F;
+  F = llvm::Intrinsic::getDeclaration(
+      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x);
+  return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
+}
Index: clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // This provides a generalized class for OpenMP runtime code generation
-// specialized by GPU target NVPTX.
+// specialized by GPU targets NVPTX and AMDGCN.
 //
 //===----------------------------------------------------------------------===//
 
@@ -199,8 +199,11 @@
   void clear() override;
 
   /// Declare generalized virtual functions which need to be defined
-  /// by all specializations of OpenMPGPURuntime Targets.
+  /// by all specializations of OpenMPGPURuntime Targets like AMDGCN
+  /// and NVPTX.
   virtual llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) = 0;
+  virtual llvm::Value *getGPUThreadID(CodeGenFunction &CGF) = 0;
+  virtual llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) = 0;
 
   /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
   /// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
Index: clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // This provides a generalized class for OpenMP runtime code generation
-// specialized by GPU target NVPTX.
+// specialized by GPU targets NVPTX and AMDGCN.
 //
 //===----------------------------------------------------------------------===//
 
@@ -621,14 +621,6 @@
 };
 } // anonymous namespace
 
-/// Get the id of the current thread on the GPU.
-static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
-  return CGF.EmitRuntimeCall(
-      llvm::Intrinsic::getDeclaration(
-          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
-      "nvptx_tid");
-}
-
 /// Get the id of the warp in the block.
 /// We assume that the warp size is 32, which is always the case
 /// on the NVPTX device, to generate more efficient code.
@@ -636,7 +628,8 @@
   CGBuilderTy &Bld = CGF.Builder;
   unsigned LaneIDBits =
       CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2);
-  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
+  return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
 }
 
 /// Get the id of the current lane in the Warp.
@@ -646,18 +639,11 @@
   CGBuilderTy &Bld = CGF.Builder;
   unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue(
       llvm::omp::GV_Warp_Size_Log2_Mask);
-  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
+  return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
                        "nvptx_lane_id");
 }
 
-/// Get the maximum number of threads in a block of the GPU.
-static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
-  return CGF.EmitRuntimeCall(
-      llvm::Intrinsic::getDeclaration(
-          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
-      "nvptx_num_threads");
-}
-
 /// Get the value of the thread_limit clause in the teams directive.
 /// For the 'generic' execution mode, the runtime encodes thread_limit in
 /// the launch parameters, always starting thread_limit+warpSize threads per
@@ -668,9 +654,9 @@
   CGBuilderTy &Bld = CGF.Builder;
   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
   return IsInSPMDExecutionMode
-             ? getNVPTXNumThreads(CGF)
-             : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), RT.getGPUWarpSize(CGF),
-                                "thread_limit");
+             ? RT.getGPUNumThreads(CGF)
+             : Bld.CreateNUWSub(RT.getGPUNumThreads(CGF),
+                                RT.getGPUWarpSize(CGF), "thread_limit");
 }
 
 /// Get the thread id of the OMP master thread.
@@ -682,8 +668,8 @@
 ///      If NumThreads is 1024, master id is 992.
 static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
   CGBuilderTy &Bld = CGF.Builder;
-  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
+  llvm::Value *NumThreads = RT.getGPUNumThreads(CGF);
   // We assume that the warp size is a power of 2.
   llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1));
 
@@ -1235,8 +1221,9 @@
   llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
   EST.ExitBB = CGF.createBasicBlock(".exit");
 
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
   llvm::Value *IsWorker =
-      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
+      Bld.CreateICmpULT(RT.getGPUThreadID(CGF), getThreadLimit(CGF));
   Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);
 
   CGF.EmitBlock(WorkerBB);
@@ -1245,7 +1232,7 @@
 
   CGF.EmitBlock(MasterCheckBB);
   llvm::Value *IsMaster =
-      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
+      Bld.CreateICmpEQ(RT.getGPUThreadID(CGF), getMasterThreadID(CGF));
   Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
 
   CGF.EmitBlock(MasterBB);
@@ -2780,14 +2767,16 @@
   llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
   llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
 
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
+
   // Get the mask of active threads in the warp.
   llvm::Value *Mask = CGF.EmitRuntimeCall(
       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_warp_active_thread_mask));
   // Fetch team-local id of the thread.
-  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
+  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
 
   // Get the width of the team.
-  llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
+  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
 
   // Initialize the counter variable for the loop.
   QualType Int32Ty =
@@ -3250,8 +3239,9 @@
     CGM.addCompilerUsedGlobal(TransferMedium);
   }
 
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
   // Get the CUDA thread id of the current OpenMP thread on the GPU.
-  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
+  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
   // nvptx_lane_id = nvptx_id % warpsize
   llvm::Value *LaneID = getNVPTXLaneID(CGF);
   // nvptx_warp_id = nvptx_id / warpsize
@@ -4844,9 +4834,11 @@
     CodeGenFunction &CGF, const OMPLoopDirective &S,
     OpenMPDistScheduleClauseKind &ScheduleKind,
     llvm::Value *&Chunk) const {
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
   if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
     ScheduleKind = OMPC_DIST_SCHEDULE_static;
-    Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
+    Chunk = CGF.EmitScalarConversion(
+        RT.getGPUNumThreads(CGF),
         CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
         S.getIterationVariable()->getType(), S.getBeginLoc());
     return;
Index: clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -22,11 +22,13 @@
 namespace clang {
 namespace CodeGen {
 
-class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntimeGPU {
+class CGOpenMPRuntimeNVPTX final : public CGOpenMPRuntimeGPU {
 
 public:
   explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);
   llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
+  llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override;
+  llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
 };
 
 } // CodeGen namespace.
Index: clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
+++ clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
@@ -1,4 +1,4 @@
-//===----- CGOpenMPRuntimeNVPTX.h - Interface to OpenMP NVPTX Runtimes ----===//
+//===--- CGOpenMPRuntimeAMDGCN.h - Interface to OpenMP AMDGCN Runtimes ---===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This provides a class for OpenMP runtime code generation specialized to NVPTX
-// targets from generalized CGOpenMPRuntimeGPU class.
+// This provides a class for OpenMP runtime code generation specialized to
+// AMDGCN targets from generalized CGOpenMPRuntimeGPU class.
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMENVPTX_H
-#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMENVPTX_H
+#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
+#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
 
 #include "CGOpenMPRuntime.h"
 #include "CGOpenMPRuntimeGPU.h"
@@ -22,14 +22,16 @@
 namespace clang {
 namespace CodeGen {
 
-class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntimeGPU {
+class CGOpenMPRuntimeAMDGCN final : public CGOpenMPRuntimeGPU {
 
 public:
-  explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);
+  explicit CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM);
   llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
+  llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override;
+  llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
 };
 
-} // CodeGen namespace.
-} // clang namespace.
+} // namespace CodeGen
+} // namespace clang
 
-#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMENVPTX_H
+#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
Index: clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
===================================================================
--- /dev/null
+++ clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
@@ -0,0 +1,64 @@
+//===-- CGOpenMPRuntimeAMDGCN.cpp - Interface to OpenMP AMDGCN Runtimes --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for OpenMP runtime code generation specialized to
+// AMDGCN targets from generalized CGOpenMPRuntimeGPU class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CGOpenMPRuntimeAMDGCN.h"
+#include "CGOpenMPRuntimeGPU.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/Attr.h"
+#include "clang/AST/DeclOpenMP.h"
+#include "clang/AST/StmtOpenMP.h"
+#include "clang/AST/StmtVisitor.h"
+#include "clang/Basic/Cuda.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+
+using namespace clang;
+using namespace CodeGen;
+using namespace llvm::omp;
+
+CGOpenMPRuntimeAMDGCN::CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM)
+    : CGOpenMPRuntimeGPU(CGM) {
+  if (!CGM.getLangOpts().OpenMPIsDevice)
+    llvm_unreachable("OpenMP AMDGCN can only handle device code.");
+}
+
+/// Get the GPU warp size.
+llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  // return constant compile-time target-specific warp size
+  unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
+  return Bld.getInt32(WarpSize);
+}
+
+/// Get the id of the current thread on the GPU.
+llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUThreadID(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  llvm::Function *F =
+      CGF.CGM.getIntrinsic(llvm::Intrinsic::amdgcn_workitem_id_x);
+  return Bld.CreateCall(F, llvm::None, "nvptx_tid");
+}
+
+/// Get the maximum number of threads in a block of the GPU.
+llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUNumThreads(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  llvm::Module *M = &CGF.CGM.getModule();
+  const char *LocSize = "__ockl_get_local_size";
+  llvm::Function *F = M->getFunction(LocSize);
+  if (!F) {
+    F = llvm::Function::Create(
+        llvm::FunctionType::get(CGF.Int64Ty, {CGF.Int32Ty}, false),
+        llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
+  }
+  return Bld.CreateTrunc(
+      Bld.CreateCall(F, {Bld.getInt32(0)}, "nvptx_num_threads"), CGF.Int32Ty);
+}

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D84260: [OpenMP][AMDGCN] Support OpenMP offloading for AMDGCN architecture - Part 3

Reply via email to