jhuber6 created this revision.
jhuber6 added reviewers: jdoerfert, JonChesterfield, tra, yaxunl.
Herald added a subscriber: carlosgalvezp.
jhuber6 requested review of this revision.
Herald added subscribers: cfe-commits, sstefan1.
Herald added a project: clang.

This patch adds the basic support for the clang driver to compile and link CUDA
using the new offloading driver. This requires handling the CUDA offloading kind
and embedding the generated files into the host. This will allow us to link
OpenMP code with CUDA code in the linker wrapper.

Depends on D120270 <https://reviews.llvm.org/D120270> D120271 
<https://reviews.llvm.org/D120271>


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D120272

Files:
  clang/include/clang/Driver/Compilation.h
  clang/lib/Driver/Driver.cpp
  clang/lib/Driver/ToolChains/Clang.cpp

Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -4374,6 +4374,7 @@
   // one input.
   bool IsCuda = JA.isOffloading(Action::OFK_Cuda);
   bool IsCudaDevice = JA.isDeviceOffloading(Action::OFK_Cuda);
+  bool IsCudaHost = JA.isHostOffloading(Action::OFK_Cuda);
   bool IsHIP = JA.isOffloading(Action::OFK_HIP);
   bool IsHIPDevice = JA.isDeviceOffloading(Action::OFK_HIP);
   bool IsOpenMPDevice = JA.isDeviceOffloading(Action::OFK_OpenMP);
@@ -4397,6 +4398,7 @@
 
   InputInfoList ModuleHeaderInputs;
   InputInfoList OpenMPHostInputs;
+  InputInfoList CudaHostInputs;
   const InputInfo *CudaDeviceInput = nullptr;
   const InputInfo *OpenMPDeviceInput = nullptr;
   for (const InputInfo &I : Inputs) {
@@ -4411,6 +4413,8 @@
             << types::getTypeName(Expected);
       }
       ModuleHeaderInputs.push_back(I);
+    } else if (IsCudaHost && Args.hasArg(options::OPT_fopenmp_new_driver)) {
+      CudaHostInputs.push_back(I);
     } else if ((IsCuda || IsHIP) && !CudaDeviceInput) {
       CudaDeviceInput = &I;
     } else if (IsOpenMPDevice && !OpenMPDeviceInput) {
@@ -6929,6 +6933,7 @@
     auto OpenMPTCs = C.getOffloadToolChains<Action::OFK_OpenMP>();
     for (auto TI = OpenMPTCs.first, TE = OpenMPTCs.second; TI != TE;
          ++TI, ++InputFile) {
+      assert(InputFile->isFilename() && "Offloading requires a filename");
       const ToolChain *TC = TI->second;
       const ArgList &TCArgs = C.getArgsForToolChain(TC, "", Action::OFK_OpenMP);
       StringRef File =
@@ -6941,6 +6946,25 @@
           TC->getTripleString() + "." +
           TCArgs.getLastArgValue(options::OPT_march_EQ) + "." + InputName));
     }
+  } else if (IsCudaHost && !CudaHostInputs.empty()) {
+    const ToolChain *CudaTC = C.getSingleOffloadToolChain<Action::OFK_Cuda>();
+    for (const auto &InputFile : CudaHostInputs) {
+      assert(InputFile.isFilename() && "Offloading requires a filename");
+      StringRef File =
+          C.getArgs().MakeArgString(CudaTC->getInputFilename(InputFile));
+      StringRef InputName = Clang::getBaseInputStem(Args, Inputs);
+      // The CUDA toolchain should have a bound arch appended to the filename.
+      StringRef Arch = File.split(".").first.rsplit('-').second;
+      // CUDA offloads both the PTX and Cubin so we need a uniqe section name.
+      if (File.endswith(".s"))
+        CmdArgs.push_back(Args.MakeArgString(
+            "-fembed-offload-object=" + File + "," + "cuda." +
+            CudaTC->getTripleString() + "." + Arch + ".ptx." + InputName));
+      else
+        CmdArgs.push_back(Args.MakeArgString(
+            "-fembed-offload-object=" + File + "," + "cuda." +
+            CudaTC->getTripleString() + "." + Arch + "." + InputName));
+    }
   }
 
   if (Triple.isAMDGPU()) {
@@ -8189,6 +8213,7 @@
   const Driver &D = getToolChain().getDriver();
   const llvm::Triple TheTriple = getToolChain().getTriple();
   auto OpenMPTCRange = C.getOffloadToolChains<Action::OFK_OpenMP>();
+  auto CudaTCRange = C.getOffloadToolChains<Action::OFK_Cuda>();
   ArgStringList CmdArgs;
 
   // Pass the CUDA path to the linker wrapper tool.
@@ -8202,6 +8227,16 @@
       break;
     }
   }
+  for (auto &I : llvm::make_range(CudaTCRange.first, CudaTCRange.second)) {
+    const ToolChain *TC = I.second;
+    if (TC->getTriple().isNVPTX()) {
+      CudaInstallationDetector CudaInstallation(D, TheTriple, Args);
+      if (CudaInstallation.isValid())
+        CmdArgs.push_back(Args.MakeArgString(
+            "--cuda-path=" + CudaInstallation.getInstallPath()));
+      break;
+    }
+  }
 
   // Get the AMDGPU math libraries.
   // FIXME: This method is bad, remove once AMDGPU has a proper math library
Index: clang/lib/Driver/Driver.cpp
===================================================================
--- clang/lib/Driver/Driver.cpp
+++ clang/lib/Driver/Driver.cpp
@@ -3830,11 +3830,6 @@
   // Builder to be used to build offloading actions.
   OffloadingActionBuilder OffloadBuilder(C, Args, Inputs);
 
-  // Offload kinds active for this compilation.
-  unsigned OffloadKinds = Action::OFK_None;
-  if (C.hasOffloadToolChain<Action::OFK_OpenMP>())
-    OffloadKinds |= Action::OFK_OpenMP;
-
   // Construct the actions to perform.
   HeaderModulePrecompileJobAction *HeaderModuleAction = nullptr;
   ActionList LinkerInputs;
@@ -3935,7 +3930,7 @@
     if (!Args.hasArg(options::OPT_fopenmp_new_driver))
       OffloadBuilder.appendTopLevelActions(Actions, Current, InputArg);
     else if (Current)
-      Current->propagateHostOffloadInfo(OffloadKinds,
+      Current->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
                                         /*BoundArch=*/nullptr);
   }
 
@@ -3956,9 +3951,9 @@
     if (ShouldEmitStaticLibrary(Args)) {
       LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image);
     } else if (Args.hasArg(options::OPT_fopenmp_new_driver) &&
-               OffloadKinds != Action::OFK_None) {
+               C.getActiveOffloadKinds() != Action::OFK_None) {
       LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image);
-      LA->propagateHostOffloadInfo(OffloadKinds,
+      LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
                                    /*BoundArch=*/nullptr);
     } else {
       LA = C.MakeAction<LinkJobAction>(LinkerInputs, types::TY_Image);
@@ -4050,6 +4045,67 @@
   Args.ClaimAllArgs(options::OPT_cuda_compile_host_device);
 }
 
+static StringRef getCanonicalArchString(Compilation &C,
+                                        llvm::opt::DerivedArgList &Args,
+                                        StringRef ArchStr,
+                                        Action::OffloadKind Kind) {
+  if (Kind == Action::OFK_Cuda) {
+    CudaArch Arch = StringToCudaArch(ArchStr);
+    if (Arch == CudaArch::UNKNOWN || !IsNVIDIAGpuArch(Arch)) {
+      C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchStr;
+      return StringRef();
+    }
+    return Args.MakeArgStringRef(CudaArchToString(Arch));
+  } else {
+    llvm::StringMap<bool> Features;
+    // getHIPOffloadTargetTriple() is known to return valid value as it has
+    // been called successfully in the CreateOffloadingDeviceToolChains().
+    auto Arch = parseTargetID(
+        *getHIPOffloadTargetTriple(C.getDriver(), C.getInputArgs()), ArchStr,
+        &Features);
+    if (!Arch) {
+      C.getDriver().Diag(clang::diag::err_drv_bad_target_id) << ArchStr;
+      C.setContainsError();
+      return StringRef();
+    }
+    return Args.MakeArgStringRef(
+        getCanonicalTargetID(Arch.getValue(), Features));
+  }
+  return StringRef();
+}
+
+static llvm::DenseSet<StringRef>
+getOffloadArchs(Compilation &C, llvm::opt::DerivedArgList &Args,
+                Action::OffloadKind Kind) {
+
+  // If this is OpenMP offloading we don't use a bound architecture.
+  if (Kind == Action::OFK_OpenMP)
+    return llvm::DenseSet<StringRef>{StringRef()};
+
+  // --offload and --offload-arch options are mutually exclusive.
+  if (Args.hasArgNoClaim(options::OPT_offload_EQ) &&
+      Args.hasArgNoClaim(options::OPT_offload_arch_EQ,
+                         options::OPT_no_offload_arch_EQ)) {
+    C.getDriver().Diag(diag::err_opt_not_valid_with_opt) << "--offload-arch"
+                                                         << "--offload";
+  }
+
+  llvm::DenseSet<StringRef> Archs;
+  for (auto &Arg : Args.getAllArgValues(options::OPT_offload_arch_EQ))
+    Archs.insert(getCanonicalArchString(C, Args, Arg, Kind));
+  for (auto &Arg : Args.getAllArgValues(options::OPT_no_offload_arch_EQ))
+    Archs.erase(getCanonicalArchString(C, Args, Arg, Kind));
+
+  if (Archs.empty()) {
+    if (Kind == Action::OFK_Cuda)
+      Archs.insert(CudaArchToString(CudaArch::SM_35));
+    else if (Kind == Action::OFK_Cuda)
+      Archs.insert(CudaArchToString(CudaArch::GFX803));
+  }
+
+  return Archs;
+}
+
 Action *Driver::BuildOffloadingActions(Compilation &C,
                                        llvm::opt::DerivedArgList &Args,
                                        const InputTy &Input,
@@ -4057,53 +4113,73 @@
   if (!isa<CompileJobAction>(HostAction))
     return HostAction;
 
-  SmallVector<const ToolChain *, 2> ToolChains;
-  ActionList DeviceActions;
+  OffloadAction::DeviceDependences DDeps;
 
   types::ID InputType = Input.first;
   const Arg *InputArg = Input.second;
 
-  auto OpenMPTCRange = C.getOffloadToolChains<Action::OFK_OpenMP>();
-  for (auto TI = OpenMPTCRange.first, TE = OpenMPTCRange.second; TI != TE; ++TI)
-    ToolChains.push_back(TI->second);
+  const Action::OffloadKind OffloadKinds[] = {
+      Action::OFK_OpenMP, Action::OFK_Cuda, Action::OFK_HIP};
 
-  for (unsigned I = 0; I < ToolChains.size(); ++I)
-    DeviceActions.push_back(C.MakeAction<InputAction>(*InputArg, InputType));
+  for (Action::OffloadKind Kind : OffloadKinds) {
+    SmallVector<const ToolChain *, 2> ToolChains;
+    ActionList DeviceActions;
 
-  if (DeviceActions.empty())
-    return HostAction;
+    auto TCRange = C.getOffloadToolChains(Kind);
+    for (auto TI = TCRange.first, TE = TCRange.second; TI != TE; ++TI)
+      ToolChains.push_back(TI->second);
 
-  auto PL = types::getCompilationPhases(*this, Args, InputType);
+    if (ToolChains.empty())
+      continue;
 
-  for (phases::ID Phase : PL) {
-    if (Phase == phases::Link) {
-      assert(Phase == PL.back() && "linking must be final compilation step.");
-      break;
-    }
+    // TODO: Handle CUID.
 
-    auto TC = ToolChains.begin();
-    for (Action *&A : DeviceActions) {
-      A = ConstructPhaseAction(C, Args, Phase, A, Action::OFK_OpenMP);
+    // Get the product of all bound architectures and toolchains.
+    SmallVector<std::pair<const ToolChain *, StringRef>> TCAndArchs;
+    for (const ToolChain *TC : ToolChains)
+      for (StringRef Arch : getOffloadArchs(C, Args, Kind))
+        TCAndArchs.push_back(std::make_pair(TC, Arch));
 
-      if (isa<CompileJobAction>(A)) {
-        HostAction->setCannotBeCollapsedWithNextDependentAction();
-        OffloadAction::HostDependence HDep(
-            *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
-            /*BourdArch=*/nullptr, Action::OFK_OpenMP);
-        OffloadAction::DeviceDependences DDep;
-        DDep.add(*A, **TC, /*BoundArch=*/nullptr, Action::OFK_OpenMP);
-        A = C.MakeAction<OffloadAction>(HDep, DDep);
+    for (unsigned I = 0, E = TCAndArchs.size(); I != E; ++I)
+      DeviceActions.push_back(C.MakeAction<InputAction>(*InputArg, InputType));
+
+    if (DeviceActions.empty())
+      return HostAction;
+
+    auto PL = types::getCompilationPhases(*this, Args, InputType);
+
+    for (phases::ID Phase : PL) {
+      if (Phase == phases::Link) {
+        assert(Phase == PL.back() && "linking must be final compilation step.");
+        break;
       }
-      ++TC;
-    }
-  }
 
-  OffloadAction::DeviceDependences DDeps;
+      auto TCAndArch = TCAndArchs.begin();
+      for (Action *&A : DeviceActions) {
+        A = ConstructPhaseAction(C, Args, Phase, A, Kind);
 
-  auto TC = ToolChains.begin();
-  for (Action *A : DeviceActions) {
-    DDeps.add(*A, **TC, /*BoundArch=*/nullptr, Action::OFK_OpenMP);
-    TC++;
+        if (isa<CompileJobAction>(A) && Kind == Action::OFK_OpenMP) {
+          HostAction->setCannotBeCollapsedWithNextDependentAction();
+          OffloadAction::HostDependence HDep(
+              *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
+              /*BoundArch=*/nullptr, Kind);
+          OffloadAction::DeviceDependences DDep;
+          DDep.add(*A, *TCAndArch->first, /*BoundArch=*/nullptr, Kind);
+          A = C.MakeAction<OffloadAction>(HDep, DDep);
+          ++TCAndArch;
+        }
+      }
+    }
+
+    auto TCAndArch = TCAndArchs.begin();
+    for (Action *A : DeviceActions) {
+      if (isa<AssembleJobAction>(A) &&
+          (Kind == Action::OFK_Cuda || Kind == Action::OFK_HIP))
+        DDeps.add(*A->getInputs()[0], *TCAndArch->first,
+                  TCAndArch->second.data(), Kind);
+      DDeps.add(*A, *TCAndArch->first, TCAndArch->second.data(), Kind);
+      ++TCAndArch;
+    }
   }
 
   OffloadAction::HostDependence HDep(
@@ -4204,7 +4280,7 @@
       return C.MakeAction<BackendJobAction>(Input, Output);
     }
     if (isUsingLTO(/* IsOffload */ true) &&
-        TargetDeviceOffloadKind == Action::OFK_OpenMP) {
+        TargetDeviceOffloadKind != Action::OFK_None) {
       types::ID Output =
           Args.hasArg(options::OPT_S) ? types::TY_LTO_IR : types::TY_LTO_BC;
       return C.MakeAction<BackendJobAction>(Input, Output);
Index: clang/include/clang/Driver/Compilation.h
===================================================================
--- clang/include/clang/Driver/Compilation.h
+++ clang/include/clang/Driver/Compilation.h
@@ -143,6 +143,8 @@
     return ActiveOffloadMask & Kind;
   }
 
+  unsigned getActiveOffloadKinds() const { return ActiveOffloadMask; }
+
   /// Iterator that visits device toolchains of a given kind.
   using const_offload_toolchains_iterator =
       const std::multimap<Action::OffloadKind,
@@ -156,6 +158,11 @@
     return OrderedOffloadingToolchains.equal_range(Kind);
   }
 
+  const_offload_toolchains_range
+  getOffloadToolChains(Action::OffloadKind Kind) const {
+    return OrderedOffloadingToolchains.equal_range(Kind);
+  }
+
   /// Return true if an offloading tool chain of a given kind exists.
   template <Action::OffloadKind Kind> bool hasOffloadToolChain() const {
     return OrderedOffloadingToolchains.find(Kind) !=
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to