https://github.com/darkbuck updated https://github.com/llvm/llvm-project/pull/165519

From 613b656c7c36c8c41d54fa30dc7a8131467c69eb Mon Sep 17 00:00:00 2001
From: Michael Liao <[email protected]>
Date: Sat, 18 Oct 2025 19:46:39 -0400
Subject: [PATCH] [CUDA] Add device-side kernel launch support

- CUDA's dynamic parallelism extension allows device-side kernel
  launches, which share the same syntax as host-side launches,
  e.g.,

    kernel<<<Dg, Db, Ns, S>>>(arguments);

  but differ in code generation. A device-side kernel launch is
  eventually translated into the following sequence:

    config = cudaGetParameterBuffer(alignment, size);
    // setup arguments by copying them into `config`.
    cudaLaunchDevice(func, config, Dg, Db, Ns, S);

- To support device-side kernel launches, 'CUDAKernelCallExpr' is
  reused, but its config expression is set to a call to
  'cudaLaunchDevice'. During code generation, 'CUDAKernelCallExpr' is
  expanded into the aforementioned sequence (a sketch of this lowering
  appears after these notes).

- Device-side kernel launches require the code to be compiled as
  relocatable device code and linked with '-lcudadevrt', so
  'clang-nvlink-wrapper' is modified to forward archives containing fat
  binaries directly.
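
- For illustration, below is a minimal sketch of the device-side launch
  this patch enables (the kernel names 'child' and 'parent' are
  hypothetical, not part of this patch):

    #include <cstdio>

    __global__ void child(int x) { printf("child: %d\n", x); }

    __global__ void parent() {
      // With this patch, the <<<>>> launch below is conceptually
      // lowered to:
      //   void *buf = cudaGetParameterBuffer(/*alignment=*/64,
      //                                      /*size=*/sizeof(int));
      //   if (buf) {
      //     *reinterpret_cast<int *>(buf) = 42; // copy the argument
      //     cudaLaunchDevice((void *)child, buf, /*Dg=*/dim3(1),
      //                      /*Db=*/dim3(1), /*Ns=*/0, /*S=*/nullptr);
      //   }
      child<<<1, 1>>>(42);
    }

  Such a TU must be compiled with '-fgpu-rdc' and linked against the
  device runtime ('-lcudadevrt').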
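
- The parameter buffer is laid out with a running offset: each argument
  is placed at its natural alignment, and the offset then advances by
  its size. A small standalone sketch of that computation (illustrative
  only; the helper names are made up):

    #include <cstddef>
    using std::size_t;

    // Round Offset up to the next multiple of Align (a power of two).
    constexpr size_t alignTo(size_t Offset, size_t Align) {
      return (Offset + Align - 1) & ~(Align - 1);
    }

    struct ArgInfo { size_t Size, Align; };

    // Total bytes needed for the parameter buffer; this mirrors the
    // offset loop the codegen runs over the kernel's parameter types.
    constexpr size_t paramBufferSize(const ArgInfo *Args, size_t N) {
      size_t Offset = 0;
      for (size_t I = 0; I != N; ++I)
        Offset = alignTo(Offset, Args[I].Align) + Args[I].Size;
      return Offset;
    }

  For example, (int, double) yields alignTo(0,4)+4 = 4, then
  alignTo(4,8)+8 = 16 bytes.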
---
 clang/include/clang/AST/ASTContext.h          |  16 +++
 .../clang/Basic/DiagnosticSemaKinds.td        |   8 ++
 clang/include/clang/Sema/SemaCUDA.h           |   5 +
 clang/include/clang/Serialization/ASTReader.h |   2 +-
 clang/lib/CodeGen/CGCUDARuntime.cpp           | 106 ++++++++++++++++++
 clang/lib/CodeGen/CGCUDARuntime.h             |   4 +
 clang/lib/CodeGen/CGExprCXX.cpp               |   6 +
 clang/lib/Sema/SemaCUDA.cpp                   |  99 +++++++++++++++-
 clang/lib/Sema/SemaDecl.cpp                   |  32 ++++--
 clang/lib/Serialization/ASTReader.cpp         |   8 +-
 clang/lib/Serialization/ASTWriter.cpp         |  37 +++---
 clang/test/CodeGenCUDA/Inputs/cuda.h          |   8 +-
 clang/test/CodeGenCUDA/device-kernel-call.cu  |  35 ++++++
 clang/test/SemaCUDA/Inputs/cuda.h             |   7 ++
 .../test/SemaCUDA/call-kernel-from-kernel.cu  |   5 +-
 clang/test/SemaCUDA/device-kernel-call.cu     |  23 ++++
 clang/test/SemaCUDA/function-overload.cu      |  26 ++---
 clang/test/SemaCUDA/function-target.cu        |   4 +-
 clang/test/SemaCUDA/reference-to-kernel-fn.cu |   4 +-
 .../ClangNVLinkWrapper.cpp                    |  46 ++++++++
 20 files changed, 426 insertions(+), 55 deletions(-)
 create mode 100644 clang/test/CodeGenCUDA/device-kernel-call.cu
 create mode 100644 clang/test/SemaCUDA/device-kernel-call.cu

diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 6e9e737dcae4f..303e8f0e9a7a4 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -500,6 +500,10 @@ class ASTContext : public RefCountedBase<ASTContext> {
 
   /// Declaration for the CUDA cudaConfigureCall function.
   FunctionDecl *cudaConfigureCallDecl = nullptr;
+  /// Declaration for the CUDA cudaGetParameterBuffer function.
+  FunctionDecl *cudaGetParameterBufferDecl = nullptr;
+  /// Declaration for the CUDA cudaLaunchDevice function.
+  FunctionDecl *cudaLaunchDeviceDecl = nullptr;
 
   /// Keeps track of all declaration attributes.
   ///
@@ -1653,6 +1657,18 @@ class ASTContext : public RefCountedBase<ASTContext> {
     return cudaConfigureCallDecl;
   }
 
+  void setcudaGetParameterBufferDecl(FunctionDecl *FD) {
+    cudaGetParameterBufferDecl = FD;
+  }
+
+  FunctionDecl *getcudaGetParameterBufferDecl() {
+    return cudaGetParameterBufferDecl;
+  }
+
+  void setcudaLaunchDeviceDecl(FunctionDecl *FD) { cudaLaunchDeviceDecl = FD; }
+
+  FunctionDecl *getcudaLaunchDeviceDecl() { return cudaLaunchDeviceDecl; }
+
   /// Returns true iff we need copy/dispose helpers for the given type.
   bool BlockRequiresCopying(QualType Ty, const VarDecl *D);
 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 3e864475f22a1..b18585b545226 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -9512,6 +9512,8 @@ def err_kern_is_nonstatic_method : Error<
   "kernel function %0 must be a free function or static member function">;
 def err_config_scalar_return : Error<
   "CUDA special function '%0' must have scalar return type">;
+def err_config_pointer_return
+    : Error<"CUDA special function '%0' must have pointer return type">;
 def err_kern_call_not_global_function : Error<
   "kernel call to non-global function %0">;
 def err_global_call_not_config : Error<
@@ -13707,4 +13709,10 @@ def err_amdgcn_load_lds_size_invalid_value : Error<"invalid size value">;
 def note_amdgcn_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">;
 
 def err_amdgcn_coop_atomic_invalid_as : Error<"cooperative atomic requires a global or generic pointer">;
+
+def err_cuda_device_kernel_launch_not_supported
+    : Error<"device-side kernel call/launch is not supported">;
+def err_cuda_device_kernel_launch_require_rdc
+    : Error<"kernel launch from __device__ or __global__ function requires "
+            "relocatable device code (i.e. requires -fgpu-rdc)">;
 } // end of sema component.
diff --git a/clang/include/clang/Sema/SemaCUDA.h b/clang/include/clang/Sema/SemaCUDA.h
index dbc1432860d89..dbb4290f5d149 100644
--- a/clang/include/clang/Sema/SemaCUDA.h
+++ b/clang/include/clang/Sema/SemaCUDA.h
@@ -273,6 +273,11 @@ class SemaCUDA : public SemaBase {
   /// of the function that will be called to configure kernel call, with the
   /// parameters specified via <<<>>>.
   std::string getConfigureFuncName() const;
+  /// Return the name of the parameter buffer allocation function for the
+  /// device kernel launch.
+  std::string getGetParameterBufferFuncName() const;
+  /// Return the name of the device kernel launch function.
+  std::string getLaunchDeviceFuncName() const;
 
   /// Record variables that are potentially ODR-used in CUDA/HIP.
   void recordPotentialODRUsedVariable(MultiExprArg Args,
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index a27cfe8a9b307..d276f0d21b958 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -1005,7 +1005,7 @@ class ASTReader
   ///
   /// The AST context tracks a few important decls, currently cudaConfigureCall,
   /// directly.
-  SmallVector<GlobalDeclID, 2> CUDASpecialDeclRefs;
+  SmallVector<GlobalDeclID, 4> CUDASpecialDeclRefs;
 
   /// The floating point pragma option settings.
   SmallVector<uint64_t, 1> FPPragmaOptions;
diff --git a/clang/lib/CodeGen/CGCUDARuntime.cpp b/clang/lib/CodeGen/CGCUDARuntime.cpp
index 121a481213396..9cbdb641d00a1 100644
--- a/clang/lib/CodeGen/CGCUDARuntime.cpp
+++ b/clang/lib/CodeGen/CGCUDARuntime.cpp
@@ -22,6 +22,112 @@ using namespace CodeGen;
 
 CGCUDARuntime::~CGCUDARuntime() {}
 
+static llvm::Value *emitGetParamBuf(CodeGenFunction &CGF,
+                                    const CUDAKernelCallExpr *E) {
+  auto *GetParamBuf = CGF.getContext().getcudaGetParameterBufferDecl();
+  const FunctionProtoType *GetParamBufProto =
+      GetParamBuf->getType()->getAs<FunctionProtoType>();
+
+  DeclRefExpr *DRE = DeclRefExpr::Create(
+      CGF.getContext(), {}, {}, GetParamBuf,
+      /*RefersToEnclosingVariableOrCapture=*/false, GetParamBuf->getNameInfo(),
+      GetParamBuf->getType(), VK_PRValue);
+  auto *ImpCast = ImplicitCastExpr::Create(
+      CGF.getContext(), CGF.getContext().getPointerType(GetParamBuf->getType()),
+      CK_FunctionToPointerDecay, DRE, nullptr, VK_PRValue, FPOptionsOverride());
+
+  CGCallee Callee = CGF.EmitCallee(ImpCast);
+  CallArgList Args;
+  // Use 64B alignment.
+  Args.add(RValue::get(CGF.CGM.getSize(CharUnits::fromQuantity(64))),
+           CGF.getContext().getSizeType());
+  // Calculate parameter sizes.
+  const PointerType *PT = E->getCallee()->getType()->getAs<PointerType>();
+  const FunctionProtoType *FTP =
+      PT->getPointeeType()->getAs<FunctionProtoType>();
+  CharUnits Offset = CharUnits::Zero();
+  for (auto ArgTy : FTP->getParamTypes()) {
+    auto TInfo = CGF.CGM.getContext().getTypeInfoInChars(ArgTy);
+    Offset = Offset.alignTo(TInfo.Align) + TInfo.Width;
+  }
+  Args.add(RValue::get(CGF.CGM.getSize(Offset)),
+           CGF.getContext().getSizeType());
+  const CGFunctionInfo &CallInfo = CGF.CGM.getTypes().arrangeFreeFunctionCall(
+      Args, GetParamBufProto, /*ChainCall=*/false);
+  auto Ret = CGF.EmitCall(CallInfo, Callee, /*ReturnValue=*/{}, Args);
+
+  return Ret.getScalarVal();
+}
+
+RValue CGCUDARuntime::EmitCUDADeviceKernelCallExpr(
+    CodeGenFunction &CGF, const CUDAKernelCallExpr *E,
+    ReturnValueSlot ReturnValue, llvm::CallBase **CallOrInvoke) {
+  ASTContext &Ctx = CGM.getContext();
+  assert(Ctx.getcudaLaunchDeviceDecl() == E->getConfig()->getDirectCallee());
+
+  llvm::BasicBlock *ConfigOKBlock = CGF.createBasicBlock("dkcall.configok");
+  llvm::BasicBlock *ContBlock = CGF.createBasicBlock("dkcall.end");
+
+  llvm::Value *Config = emitGetParamBuf(CGF, E);
+  CGF.Builder.CreateCondBr(
+      CGF.Builder.CreateICmpNE(Config,
+                               llvm::Constant::getNullValue(Config->getType())),
+      ConfigOKBlock, ContBlock);
+
+  CodeGenFunction::ConditionalEvaluation eval(CGF);
+
+  eval.begin(CGF);
+  CGF.EmitBlock(ConfigOKBlock);
+
+  QualType KernelCalleeFuncTy =
+      E->getCallee()->getType()->getAs<PointerType>()->getPointeeType();
+  CGCallee KernelCallee = CGF.EmitCallee(E->getCallee());
+  // Emit kernel arguments.
+  CallArgList KernelCallArgs;
+  CGF.EmitCallArgs(KernelCallArgs,
+                   KernelCalleeFuncTy->getAs<FunctionProtoType>(),
+                   E->arguments(), E->getDirectCallee());
+  // Copy the emitted kernel arguments into the parameter buffer.
+  RawAddress CfgBase(Config, CGM.Int8Ty,
+                     /*Alignment=*/CharUnits::fromQuantity(64));
+  CharUnits Offset = CharUnits::Zero();
+  for (auto &Arg : KernelCallArgs) {
+    auto TInfo = CGM.getContext().getTypeInfoInChars(Arg.getType());
+    Offset = Offset.alignTo(TInfo.Align);
+    Address Addr =
+        CGF.Builder.CreateConstInBoundsGEP(CfgBase, Offset.getQuantity());
+    Arg.copyInto(CGF, Addr);
+    Offset += TInfo.Width;
+  }
+  // Make the `cudaLaunchDevice` call, i.e., E->getConfig().
+  const CallExpr *LaunchCall = E->getConfig();
+  QualType LaunchCalleeFuncTy = LaunchCall->getCallee()
+                                    ->getType()
+                                    ->getAs<PointerType>()
+                                    ->getPointeeType();
+  CGCallee LaunchCallee = CGF.EmitCallee(LaunchCall->getCallee());
+  CallArgList LaunchCallArgs;
+  CGF.EmitCallArgs(LaunchCallArgs,
+                   LaunchCalleeFuncTy->getAs<FunctionProtoType>(),
+                   LaunchCall->arguments(), LaunchCall->getDirectCallee());
+  // Replace the func and parameter buffer arguments.
+  LaunchCallArgs[0] = CallArg(RValue::get(KernelCallee.getFunctionPointer()),
+                              CGM.getContext().VoidPtrTy);
+  LaunchCallArgs[1] = CallArg(RValue::get(Config), CGM.getContext().VoidPtrTy);
+  const CGFunctionInfo &LaunchCallInfo = CGM.getTypes().arrangeFreeFunctionCall(
+      LaunchCallArgs, LaunchCalleeFuncTy->getAs<FunctionProtoType>(),
+      /*ChainCall=*/false);
+  CGF.EmitCall(LaunchCallInfo, LaunchCallee, ReturnValue, LaunchCallArgs,
+               CallOrInvoke,
+               /*IsMustTail=*/false, E->getExprLoc());
+  CGF.EmitBranch(ContBlock);
+
+  CGF.EmitBlock(ContBlock);
+  eval.end(CGF);
+
+  return RValue::get(nullptr);
+}
+
 RValue CGCUDARuntime::EmitCUDAKernelCallExpr(CodeGenFunction &CGF,
                                              const CUDAKernelCallExpr *E,
                                              ReturnValueSlot ReturnValue,
diff --git a/clang/lib/CodeGen/CGCUDARuntime.h b/clang/lib/CodeGen/CGCUDARuntime.h
index 86f776004ee7c..64fb9a31422e0 100644
--- a/clang/lib/CodeGen/CGCUDARuntime.h
+++ b/clang/lib/CodeGen/CGCUDARuntime.h
@@ -88,6 +88,10 @@ class CGCUDARuntime {
                          ReturnValueSlot ReturnValue,
                          llvm::CallBase **CallOrInvoke = nullptr);
 
+  virtual RValue EmitCUDADeviceKernelCallExpr(
+      CodeGenFunction &CGF, const CUDAKernelCallExpr *E,
+      ReturnValueSlot ReturnValue, llvm::CallBase **CallOrInvoke = nullptr);
+
   /// Emits a kernel launch stub.
   virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) = 0;
 
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index f64cf9f8a6c2d..8a2c021b2210f 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -503,6 +503,12 @@ RValue CodeGenFunction::EmitCXXOperatorMemberCallExpr(
 RValue CodeGenFunction::EmitCUDAKernelCallExpr(const CUDAKernelCallExpr *E,
                                                ReturnValueSlot ReturnValue,
                                                llvm::CallBase **CallOrInvoke) {
+  auto *FD = E->getConfig()->getDirectCallee();
+  // Emit as a device kernel call if the config expression is a call to
+  // 'cudaLaunchDevice'.
+  if (FD && CGM.getContext().getcudaLaunchDeviceDecl() == FD)
+    return CGM.getCUDARuntime().EmitCUDADeviceKernelCallExpr(
+        *this, E, ReturnValue, CallOrInvoke);
   return CGM.getCUDARuntime().EmitCUDAKernelCallExpr(*this, E, ReturnValue,
                                                      CallOrInvoke);
 }
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index 31735a0f5feb3..dd9bcab56b083 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -52,16 +52,94 @@ bool SemaCUDA::PopForceHostDevice() {
 ExprResult SemaCUDA::ActOnExecConfigExpr(Scope *S, SourceLocation LLLLoc,
                                          MultiExprArg ExecConfig,
                                          SourceLocation GGGLoc) {
-  FunctionDecl *ConfigDecl = getASTContext().getcudaConfigureCallDecl();
+  bool IsDeviceKernelCall = false;
+  switch (CurrentTarget()) {
+  case CUDAFunctionTarget::Global:
+  case CUDAFunctionTarget::Device:
+    IsDeviceKernelCall = true;
+    break;
+  case CUDAFunctionTarget::HostDevice:
+    if (getLangOpts().CUDAIsDevice) {
+      IsDeviceKernelCall = true;
+      if (FunctionDecl *Caller =
+              SemaRef.getCurFunctionDecl(/*AllowLambda=*/true);
+          Caller && isImplicitHostDeviceFunction(Caller)) {
+        // During device compilation, a config call in an HD function should
+        // be treated as a device kernel call. But, for implicit HD functions
+        // (such as lambdas), we need to check whether RDC is enabled.
+        if (!getLangOpts().GPURelocatableDeviceCode)
+          IsDeviceKernelCall = false;
+        // HIP doesn't support device-side kernel call yet. Still treat it as
+        // the host-side kernel call.
+        if (getLangOpts().HIP)
+          IsDeviceKernelCall = false;
+      }
+    }
+    break;
+  default:
+    break;
+  }
+
+  if (IsDeviceKernelCall && getLangOpts().HIP)
+    return ExprError(
+        Diag(LLLLoc, diag::err_cuda_device_kernel_launch_not_supported));
+
+  if (IsDeviceKernelCall && !getLangOpts().GPURelocatableDeviceCode)
+    return ExprError(
+        Diag(LLLLoc, diag::err_cuda_device_kernel_launch_require_rdc));
+
+  FunctionDecl *ConfigDecl = IsDeviceKernelCall
+                                 ? getASTContext().getcudaLaunchDeviceDecl()
+                                 : getASTContext().getcudaConfigureCallDecl();
   if (!ConfigDecl)
     return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
-                     << getConfigureFuncName());
+                     << (IsDeviceKernelCall ? getLaunchDeviceFuncName()
+                                            : getConfigureFuncName()));
+  // For a device kernel call, also check the parameter buffer function.
+  if (IsDeviceKernelCall) {
+    auto *GetParamBuf = getASTContext().getcudaGetParameterBufferDecl();
+    if (!GetParamBuf)
+      return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
+                       << getGetParameterBufferFuncName());
+  }
+
   QualType ConfigQTy = ConfigDecl->getType();
 
   DeclRefExpr *ConfigDR = new (getASTContext()) DeclRefExpr(
       getASTContext(), ConfigDecl, false, ConfigQTy, VK_LValue, LLLLoc);
   SemaRef.MarkFunctionReferenced(LLLLoc, ConfigDecl);
 
+  if (IsDeviceKernelCall) {
+    SmallVector<Expr *> Args;
+    // Use a null pointer constant as the kernel function, which may not be
+    // resolvable here. For example, resolving an overloaded kernel may
+    // require the actual kernel arguments.
+    llvm::APInt Zero(SemaRef.Context.getTypeSize(SemaRef.Context.IntTy), 0);
+    Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
+                                          SemaRef.Context.IntTy, LLLLoc));
+    // Use a null pointer constant as a placeholder for the parameter buffer;
+    // it is replaced with the actual allocation during codegen.
+    Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
+                                          SemaRef.Context.IntTy, LLLLoc));
+    // Add the original config arguments.
+    llvm::append_range(Args, ExecConfig);
+    // Add the default blockDim if it's missing.
+    if (Args.size() < 4) {
+      llvm::APInt One(SemaRef.Context.getTypeSize(SemaRef.Context.IntTy), 1);
+      Args.push_back(IntegerLiteral::Create(SemaRef.Context, One,
+                                            SemaRef.Context.IntTy, LLLLoc));
+    }
+    // Add the default sharedMemSize if it's missing.
+    if (Args.size() < 5)
+      Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
+                                            SemaRef.Context.IntTy, LLLLoc));
+    // Add the default stream if it's missing.
+    if (Args.size() < 6)
+      Args.push_back(new (SemaRef.Context) CXXNullPtrLiteralExpr(
+          SemaRef.Context.NullPtrTy, LLLLoc));
+    return SemaRef.BuildCallExpr(S, ConfigDR, LLLLoc, Args, GGGLoc, nullptr,
+                                 /*IsExecConfig=*/true);
+  }
   return SemaRef.BuildCallExpr(S, ConfigDR, LLLLoc, ExecConfig, GGGLoc, nullptr,
                                /*IsExecConfig=*/true);
 }
@@ -246,12 +324,12 @@ SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
       CalleeTarget == CUDAFunctionTarget::InvalidTarget)
     return CFP_Never;
 
-  // (a) Can't call global from some contexts until we support CUDA's
-  // dynamic parallelism.
+  // (a) Calling a global function from global or device contexts is allowed
+  // as part of CUDA's dynamic parallelism support.
   if (CalleeTarget == CUDAFunctionTarget::Global &&
       (CallerTarget == CUDAFunctionTarget::Global ||
        CallerTarget == CUDAFunctionTarget::Device))
-    return CFP_Never;
+    return CFP_Native;
 
   // (b) Calling HostDevice is OK for everyone.
   if (CalleeTarget == CUDAFunctionTarget::HostDevice)
@@ -279,7 +357,8 @@ SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
   if (CallerTarget == CUDAFunctionTarget::HostDevice) {
     // It's OK to call a compilation-mode matching function from an HD one.
     if ((getLangOpts().CUDAIsDevice &&
-         CalleeTarget == CUDAFunctionTarget::Device) ||
+         (CalleeTarget == CUDAFunctionTarget::Device ||
+          CalleeTarget == CUDAFunctionTarget::Global)) ||
         (!getLangOpts().CUDAIsDevice &&
          (CalleeTarget == CUDAFunctionTarget::Host ||
           CalleeTarget == CUDAFunctionTarget::Global)))
@@ -1103,6 +1182,14 @@ std::string SemaCUDA::getConfigureFuncName() const {
   return "cudaConfigureCall";
 }
 
+std::string SemaCUDA::getGetParameterBufferFuncName() const {
+  return "cudaGetParameterBuffer";
+}
+
+std::string SemaCUDA::getLaunchDeviceFuncName() const {
+  return "cudaLaunchDevice";
+}
+
 // Record any local constexpr variables that are passed one way on the host
 // and another on the device.
 void SemaCUDA::recordPotentialODRUsedVariable(
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 25b89d65847ad..4954c0864b91b 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -11050,14 +11050,30 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
   }
 
   if (getLangOpts().CUDA) {
-    IdentifierInfo *II = NewFD->getIdentifier();
-    if (II && II->isStr(CUDA().getConfigureFuncName()) &&
-        !NewFD->isInvalidDecl() &&
-        NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
-      if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
-        Diag(NewFD->getLocation(), diag::err_config_scalar_return)
-            << CUDA().getConfigureFuncName();
-      Context.setcudaConfigureCallDecl(NewFD);
+    if (IdentifierInfo *II = NewFD->getIdentifier()) {
+      if (II->isStr(CUDA().getConfigureFuncName()) && !NewFD->isInvalidDecl() &&
+          NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
+        if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
+          Diag(NewFD->getLocation(), diag::err_config_scalar_return)
+              << CUDA().getConfigureFuncName();
+        Context.setcudaConfigureCallDecl(NewFD);
+      }
+      if (II->isStr(CUDA().getGetParameterBufferFuncName()) &&
+          !NewFD->isInvalidDecl() &&
+          NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
+        if (!R->castAs<FunctionType>()->getReturnType()->isPointerType())
+          Diag(NewFD->getLocation(), diag::err_config_pointer_return)
+              << CUDA().getGetParameterBufferFuncName();
+        Context.setcudaGetParameterBufferDecl(NewFD);
+      }
+      if (II->isStr(CUDA().getLaunchDeviceFuncName()) &&
+          !NewFD->isInvalidDecl() &&
+          NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
+        if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
+          Diag(NewFD->getLocation(), diag::err_config_scalar_return)
+              << CUDA().getLaunchDeviceFuncName();
+        Context.setcudaLaunchDeviceDecl(NewFD);
+      }
     }
   }
 
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 55c52154c4113..5c82cafc49177 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -5580,9 +5580,13 @@ void ASTReader::InitializeContext() {
 
   // If there were any CUDA special declarations, deserialize them.
   if (!CUDASpecialDeclRefs.empty()) {
-    assert(CUDASpecialDeclRefs.size() == 1 && "More decl refs than expected!");
+    assert(CUDASpecialDeclRefs.size() == 3 && "More decl refs than expected!");
     Context.setcudaConfigureCallDecl(
-                           cast<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[0])));
+        cast_or_null<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[0])));
+    Context.setcudaGetParameterBufferDecl(
+        cast_or_null<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[1])));
+    Context.setcudaLaunchDeviceDecl(
+        cast_or_null<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[2])));
   }
 
   // Re-export any modules that were imported by a non-module AST file.
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 547497cbd87d9..1871e48df35ff 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -5706,8 +5706,13 @@ void ASTWriter::PrepareWritingSpecialDecls(Sema &SemaRef) {
     GetDeclRef(SemaRef.getStdAlignValT());
   }
 
-  if (Context.getcudaConfigureCallDecl())
+  if (Context.getcudaConfigureCallDecl() ||
+      Context.getcudaGetParameterBufferDecl() ||
+      Context.getcudaLaunchDeviceDecl()) {
     GetDeclRef(Context.getcudaConfigureCallDecl());
+    GetDeclRef(Context.getcudaGetParameterBufferDecl());
+    GetDeclRef(Context.getcudaLaunchDeviceDecl());
+  }
 
   // Writing all of the known namespaces.
   for (const auto &I : SemaRef.KnownNamespaces)
@@ -5834,19 +5839,19 @@ void ASTWriter::WriteSpecialDeclRecords(Sema &SemaRef) {
       Stream.EmitRecord(PENDING_IMPLICIT_INSTANTIATIONS, PendingInstantiations);
   }
 
+  auto AddEmittedDeclRefOrZero = [this](RecordData &Refs, Decl *D) {
+    if (!D || !wasDeclEmitted(D))
+      Refs.push_back(0);
+    else
+      AddDeclRef(D, Refs);
+  };
+
   // Write the record containing declaration references of Sema.
   RecordData SemaDeclRefs;
   if (SemaRef.StdNamespace || SemaRef.StdBadAlloc || SemaRef.StdAlignValT) {
-    auto AddEmittedDeclRefOrZero = [this, &SemaDeclRefs](Decl *D) {
-      if (!D || !wasDeclEmitted(D))
-        SemaDeclRefs.push_back(0);
-      else
-        AddDeclRef(D, SemaDeclRefs);
-    };
-
-    AddEmittedDeclRefOrZero(SemaRef.getStdNamespace());
-    AddEmittedDeclRefOrZero(SemaRef.getStdBadAlloc());
-    AddEmittedDeclRefOrZero(SemaRef.getStdAlignValT());
+    AddEmittedDeclRefOrZero(SemaDeclRefs, SemaRef.getStdNamespace());
+    AddEmittedDeclRefOrZero(SemaDeclRefs, SemaRef.getStdBadAlloc());
+    AddEmittedDeclRefOrZero(SemaDeclRefs, SemaRef.getStdAlignValT());
   }
   if (!SemaDeclRefs.empty())
     Stream.EmitRecord(SEMA_DECL_REFS, SemaDeclRefs);
@@ -5862,9 +5867,13 @@ void ASTWriter::WriteSpecialDeclRecords(Sema &SemaRef) {
 
   // Write the record containing CUDA-specific declaration references.
   RecordData CUDASpecialDeclRefs;
-  if (auto *CudaCallDecl = Context.getcudaConfigureCallDecl();
-      CudaCallDecl && wasDeclEmitted(CudaCallDecl)) {
-    AddDeclRef(CudaCallDecl, CUDASpecialDeclRefs);
+  if (auto *CudaCallDecl = Context.getcudaConfigureCallDecl(),
+      *CudaGetParamDecl = Context.getcudaGetParameterBufferDecl(),
+      *CudaLaunchDecl = Context.getcudaLaunchDeviceDecl();
+      CudaCallDecl || CudaGetParamDecl || CudaLaunchDecl) {
+    AddEmittedDeclRefOrZero(CUDASpecialDeclRefs, CudaCallDecl);
+    AddEmittedDeclRefOrZero(CUDASpecialDeclRefs, CudaGetParamDecl);
+    AddEmittedDeclRefOrZero(CUDASpecialDeclRefs, CudaLaunchDecl);
     Stream.EmitRecord(CUDA_SPECIAL_DECL_REFS, CUDASpecialDeclRefs);
   }
 
diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h
index e7ad784335027..421fa4dd7dbae 100644
--- a/clang/test/CodeGenCUDA/Inputs/cuda.h
+++ b/clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -72,7 +72,13 @@ extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
 extern "C" cudaError_t cudaLaunchKernel_ptsz(const void *func, dim3 gridDim,
                                         dim3 blockDim, void **args,
                                         size_t sharedMem, cudaStream_t stream);
-
+extern "C" __device__ cudaError_t cudaLaunchDevice(void *func,
+                                                   void *parameterBuffer,
+                                                   dim3 gridDim, dim3 blockDim,
+                                                   unsigned int sharedMem,
+                                                   cudaStream_t stream);
+extern "C" __device__ void *cudaGetParameterBuffer(size_t alignment,
+                                                   size_t size);
 #endif
 
 extern "C" __device__ int printf(const char*, ...);
diff --git a/clang/test/CodeGenCUDA/device-kernel-call.cu b/clang/test/CodeGenCUDA/device-kernel-call.cu
new file mode 100644
index 0000000000000..eff2b37bd298d
--- /dev/null
+++ b/clang/test/CodeGenCUDA/device-kernel-call.cu
@@ -0,0 +1,35 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -fgpu-rdc -emit-llvm %s -o - | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+// CHECK-LABEL: define dso_local ptx_kernel void @_Z2g2i(
+// CHECK-SAME: i32 noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
+// CHECK-NEXT:    ret void
+//
+__global__ void g2(int x) {}
+
+// CHECK-LABEL: define dso_local ptx_kernel void @_Z2g1v(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_DIM3:%.*]], align 4
+// CHECK-NEXT:    [[AGG_TMP1:%.*]] = alloca [[STRUCT_DIM3]], align 4
+// CHECK-NEXT:    [[CALL:%.*]] = call ptr @cudaGetParameterBuffer(i64 noundef 64, i64 noundef 4) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT:    [[TMP0:%.*]] = icmp ne ptr [[CALL]], null
+// CHECK-NEXT:    br i1 [[TMP0]], label %[[DKCALL_CONFIGOK:.*]], label %[[DKCALL_END:.*]]
+// CHECK:       [[DKCALL_CONFIGOK]]:
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[CALL]], i64 0
+// CHECK-NEXT:    store i32 42, ptr [[TMP1]], align 64
+// CHECK-NEXT:    call void @_ZN4dim3C1Ejjj(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP]], i32 noundef 1, i32 noundef 1, i32 noundef 1) #[[ATTR3]]
+// CHECK-NEXT:    call void @_ZN4dim3C1Ejjj(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP1]], i32 noundef 1, i32 noundef 1, i32 noundef 1) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL2:%.*]] = call i32 @cudaLaunchDevice(ptr noundef @_Z2g2i, ptr noundef [[CALL]], ptr noundef byval([[STRUCT_DIM3]]) align 4 [[AGG_TMP]], ptr noundef byval([[STRUCT_DIM3]]) align 4 [[AGG_TMP1]], i32 noundef 0, ptr noundef null) #[[ATTR3]]
+// CHECK-NEXT:    br label %[[DKCALL_END]]
+// CHECK:       [[DKCALL_END]]:
+// CHECK-NEXT:    ret void
+//
+__global__ void g1(void) {
+  g2<<<1, 1>>>(42);
+}
diff --git a/clang/test/SemaCUDA/Inputs/cuda.h b/clang/test/SemaCUDA/Inputs/cuda.h
index 2bf45e03d91c7..de6f7fb635421 100644
--- a/clang/test/SemaCUDA/Inputs/cuda.h
+++ b/clang/test/SemaCUDA/Inputs/cuda.h
@@ -46,6 +46,13 @@ extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
 extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
                                         dim3 blockDim, void **args,
                                         size_t sharedMem, cudaStream_t stream);
+extern "C" __device__ cudaError_t cudaLaunchDevice(void *func,
+                                                   void *parameterBuffer,
+                                                   dim3 gridDim, dim3 blockDim,
+                                                   unsigned int sharedMem,
+                                                   cudaStream_t stream);
+extern "C" __device__ void *cudaGetParameterBuffer(size_t alignment,
+                                                   size_t size);
 #endif
 
 // Host- and device-side placement new overloads.
diff --git a/clang/test/SemaCUDA/call-kernel-from-kernel.cu b/clang/test/SemaCUDA/call-kernel-from-kernel.cu
index 5f8832f3cd070..01dba44339520 100644
--- a/clang/test/SemaCUDA/call-kernel-from-kernel.cu
+++ b/clang/test/SemaCUDA/call-kernel-from-kernel.cu
@@ -1,9 +1,12 @@
 // RUN: %clang_cc1 %s --std=c++11 -triple nvptx -o - \
 // RUN:   -verify -fcuda-is-device -fsyntax-only -verify-ignore-unexpected=note
+// RUN: %clang_cc1 %s --std=c++11 -fgpu-rdc -triple nvptx -o - \
+// RUN:   -verify=rdc -fcuda-is-device -fsyntax-only -verify-ignore-unexpected=note
+// rdc-no-diagnostics
 
 #include "Inputs/cuda.h"
 
 __global__ void kernel1();
 __global__ void kernel2() {
-  kernel1<<<1,1>>>(); // expected-error {{reference to __global__ function 'kernel1' in __global__ function}}
+  kernel1<<<1,1>>>(); // expected-error {{kernel launch from __device__ or __global__ function requires relocatable device code (i.e. requires -fgpu-rdc)}}
 }
diff --git a/clang/test/SemaCUDA/device-kernel-call.cu b/clang/test/SemaCUDA/device-kernel-call.cu
new file mode 100644
index 0000000000000..fea6deac02e55
--- /dev/null
+++ b/clang/test/SemaCUDA/device-kernel-call.cu
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 -fcuda-is-device -verify=nordc %s
+// RUN: %clang_cc1 -fcuda-is-device -fgpu-rdc -verify=rdc %s
+// RUN: %clang_cc1 -x hip -fcuda-is-device -verify=hip %s
+
+// rdc-no-diagnostics
+
+#include "Inputs/cuda.h"
+
+__global__ void g2(int x) {}
+
+// CHECK-LABEL: define{{.*}}g1
+__global__ void g1(void) {
+  // CHECK: [[CONFIG:%.*]] = call{{.*}}_Z22cudaGetParameterBuffermm(i64{{.*}}64, i64{{.*}}4)
+  // CHECK-NEXT: [[FLAG:%.*]] = icmp ne ptr [[CONFIG]], null
+  // CHECK-NEXT: br i1 [[FLAG]], label %[[THEN:.*]], label %[[ENDIF:.*]]
+  // CHECK: [[THEN]]:
+  // CHECK-NEXT: [[PPTR:%.*]] = getelementptr{{.*}}i8, ptr [[CONFIG]], i64 0
+  // CHECK-NEXT: store i32 42, ptr [[PPTR]]
+  // CHECK: = call{{.*}} i32 @_Z16cudaLaunchDevicePvS_4dim3S0_jP10cudaStream(ptr{{.*}} @_Z2g2i, ptr{{.*}} [[CONFIG]],
+  g2<<<1, 1>>>(42);
+  // nordc-error@-1 {{kernel launch from __device__ or __global__ function requires relocatable device code (i.e. requires -fgpu-rdc)}}
+  // hip-error@-2 {{device-side kernel call/launch is not supported}}
+}
diff --git a/clang/test/SemaCUDA/function-overload.cu b/clang/test/SemaCUDA/function-overload.cu
index 3d05839af7528..11f84a912ea7b 100644
--- a/clang/test/SemaCUDA/function-overload.cu
+++ b/clang/test/SemaCUDA/function-overload.cu
@@ -91,10 +91,7 @@ __host__ HostReturnTy h() { return HostReturnTy(); }
 // devdefer-note@-4 1+ {{candidate function not viable: call to __host__ function from __global__ function}}
 
 __global__ void g() {}
-// dev-note@-1 1+ {{'g' declared here}}
-// devdefer-note@-2 1+ {{candidate function not viable: call to __global__ function from __device__ function}}
 // expected-note@-3 0+ {{candidate function not viable: call to __global__ function from __host__ __device__ function}}
-// devdefer-note@-4 1+ {{candidate function not viable: call to __global__ function from __global__ function}}
 
 extern "C" __device__ DeviceReturnTy cd() { return DeviceReturnTy(); }
 // host-note@-1 1+ {{'cd' declared here}}
@@ -144,9 +141,9 @@ __device__ void devicef() {
   DeviceFnPtr fp_cdh = cdh;
   DeviceReturnTy ret_cdh = cdh();
 
-  GlobalFnPtr fp_g = g; // dev-error {{reference to __global__ function 'g' in __device__ function}}
-  g(); // devdefer-error {{no matching function for call to 'g'}}
-  g<<<0,0>>>(); // dev-error {{reference to __global__ function 'g' in __device__ function}}
+  GlobalFnPtr fp_g = g;
+  g(); // expected-error {{call to global function 'g' not configured}}
+  g<<<0,0>>>(); // expected-error {{kernel launch from __device__ or __global__ function requires relocatable device code (i.e. requires -fgpu-rdc)}}
 }
 
 __global__ void globalf() {
@@ -165,9 +162,9 @@ __global__ void globalf() {
   DeviceFnPtr fp_cdh = cdh;
   DeviceReturnTy ret_cdh = cdh();
 
-  GlobalFnPtr fp_g = g; // dev-error {{reference to __global__ function 'g' in __global__ function}}
-  g(); // devdefer-error {{no matching function for call to 'g'}}
-  g<<<0,0>>>(); // dev-error {{reference to __global__ function 'g' in __global__ function}}
+  GlobalFnPtr fp_g = g;
+  g(); // expected-error {{call to global function 'g' not configured}}
+  g<<<0,0>>>(); // expected-error {{kernel launch from __device__ or __global__ function requires relocatable device code (i.e. requires -fgpu-rdc)}}
 }
 
 __host__ __device__ void hostdevicef() {
@@ -199,20 +196,13 @@ __host__ __device__ void hostdevicef() {
   CurrentReturnTy ret_cdh = cdh();
 
   GlobalFnPtr fp_g = g;
-#if defined(__CUDA_ARCH__)
-  // expected-error@-2 {{reference to __global__ function 'g' in __host__ __device__ function}}
-#endif
 
   g();
-#if defined (__CUDA_ARCH__)
-  // expected-error@-2 {{reference to __global__ function 'g' in __host__ __device__ function}}
-#else
-  // expected-error@-4 {{call to global function 'g' not configured}}
-#endif
+  // expected-error@-1 {{call to global function 'g' not configured}}
 
   g<<<0,0>>>();
 #if defined(__CUDA_ARCH__)
-  // expected-error@-2 {{reference to __global__ function 'g' in __host__ __device__ function}}
+  // expected-error@-2 {{kernel launch from __device__ or __global__ function requires relocatable device code (i.e. requires -fgpu-rdc)}}
 #endif
 }
 
diff --git a/clang/test/SemaCUDA/function-target.cu b/clang/test/SemaCUDA/function-target.cu
index 64444b6676248..66704a320cee1 100644
--- a/clang/test/SemaCUDA/function-target.cu
+++ b/clang/test/SemaCUDA/function-target.cu
@@ -24,11 +24,11 @@ __host__ void h1(void) {
 __host__ void d1h(void); // expected-note {{candidate function not viable: call to __host__ function from __device__ function}}
 __device__ void d1d(void);
 __host__ __device__ void d1hd(void);
-__global__ void d1g(void); // dev-note {{'d1g' declared here}}
+__global__ void d1g(void);
 
 __device__ void d1(void) {
   d1h(); // expected-error {{no matching function}}
   d1d();
   d1hd();
-  d1g<<<1, 1>>>(); // dev-error {{reference to __global__ function 'd1g' in __device__ function}}
+  d1g<<<1, 1>>>(); // expected-error {{kernel launch from __device__ or __global__ function requires relocatable device code (i.e. requires -fgpu-rdc)}}
 }
diff --git a/clang/test/SemaCUDA/reference-to-kernel-fn.cu b/clang/test/SemaCUDA/reference-to-kernel-fn.cu
index 70a1cda6ab0c8..bdb70fc8b55d1 100644
--- a/clang/test/SemaCUDA/reference-to-kernel-fn.cu
+++ b/clang/test/SemaCUDA/reference-to-kernel-fn.cu
@@ -8,6 +8,7 @@
 // device-side kernel launches.)
 
 // host-no-diagnostics
+// dev-no-diagnostics
 
 #include "Inputs/cuda.h"
 
@@ -19,11 +20,10 @@ typedef void (*fn_ptr_t)();
 
 __host__ __device__ fn_ptr_t get_ptr_hd() {
   return kernel;
-  // dev-error@-1 {{reference to __global__ function}}
 }
 __host__ fn_ptr_t get_ptr_h() {
   return kernel;
 }
 __device__ fn_ptr_t get_ptr_d() {
-  return kernel;  // dev-error {{reference to __global__ function}}
+  return kernel;
 }
diff --git a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
index 58eb671c61989..07fa67f9b956c 100644
--- a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
+++ b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
@@ -16,6 +16,7 @@
 
 #include "clang/Basic/Version.h"
 
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/BinaryFormat/Magic.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
@@ -165,6 +166,41 @@ void diagnosticHandler(const DiagnosticInfo &DI) {
   }
 }
 
+// Check if the archive has any object file with a fat binary.
+bool hasFatBinary(const Archive &LibFile) {
+  Error Err = Error::success();
+  for (auto &C : LibFile.children(Err)) {
+    auto ChildBufferOrErr = C.getMemoryBufferRef();
+    if (!ChildBufferOrErr)
+      return false;
+    auto ObjFileOrErr = ObjectFile::createObjectFile(*ChildBufferOrErr);
+    if (!ObjFileOrErr)
+      return false;
+    const auto &Obj = **ObjFileOrErr;
+    // Skip device object files.
+    if (Obj.getArch() == Triple::nvptx || Obj.getArch() == Triple::nvptx64)
+      return false;
+    // For host object files, search for the fat binary section.
+    for (const auto &Sec : Obj.sections()) {
+      auto NameOrErr = Sec.getName();
+      if (!NameOrErr)
+        continue;
+      // Search for fat binary sections.
+      if (*NameOrErr != "__nv_relfatbin" && *NameOrErr != ".nv_fatbin")
+        continue;
+      auto ContentOrErr = Sec.getContents();
+      if (!ContentOrErr ||
+          identify_magic(*ContentOrErr) != file_magic::cuda_fatbinary)
+        continue;
+      return true;
+    }
+  }
+  // Consume the iteration error so it is marked as checked.
+  if (Err)
+    return false;
+  return false;
+}
+
 Expected<StringRef> createTempFile(const ArgList &Args, const Twine &Prefix,
                                    StringRef Extension) {
   SmallString<128> OutputFile;
@@ -487,6 +523,9 @@ Expected<SmallVector<StringRef>> getInput(const ArgList &Args) {
   for (const opt::Arg *Arg : Args.filtered(OPT_library_path))
     LibraryPaths.push_back(Arg->getValue());
 
+  // Archives (with fat binaries) to be forwarded to nvlink.
+  SmallVector<const char *> ForwardArchives;
+
   bool WholeArchive = false;
   SmallVector<std::pair<std::unique_ptr<MemoryBuffer>, bool>> InputFiles;
   for (const opt::Arg *Arg : Args.filtered(
@@ -525,6 +564,11 @@ Expected<SmallVector<StringRef>> getInput(const ArgList &Args) {
           object::Archive::create(Buffer);
       if (!LibFile)
         return LibFile.takeError();
+      // Skip extracting archives with fat binaries. Forward them to nvlink.
+      if (hasFatBinary(**LibFile)) {
+        ForwardArchives.emplace_back(Args.MakeArgString(*Filename));
+        break;
+      }
       Error Err = Error::success();
       for (auto Child : (*LibFile)->children(Err)) {
         auto ChildBufferOrErr = Child.getMemoryBufferRef();
@@ -687,6 +731,8 @@ Expected<SmallVector<StringRef>> getInput(const ArgList &Args) {
       return E;
     Files.emplace_back(Args.MakeArgString(*TempFileOrErr));
   }
+  // Append archives to be forwarded.
+  append_range(Files, ForwardArchives);
 
   return Files;
 }
