The changes no longer depend on the external ptxwrap tool.
CUDA runtime support in CGCUDANV.cpp now generates per-module
constructor/destructor to load and initialize GPU code.
http://reviews.llvm.org/D8463
Files:
include/clang/Driver/Action.h
include/clang/Driver/CC1Options.td
include/clang/Driver/Driver.h
include/clang/Driver/Options.td
include/clang/Driver/Types.def
include/clang/Frontend/CodeGenOptions.h
lib/CodeGen/CGCUDANV.cpp
lib/CodeGen/CGCUDARuntime.cpp
lib/CodeGen/CGCUDARuntime.h
lib/CodeGen/CodeGenFunction.cpp
lib/CodeGen/CodeGenModule.cpp
lib/Driver/Action.cpp
lib/Driver/Driver.cpp
lib/Driver/ToolChain.cpp
lib/Driver/ToolChains.cpp
lib/Driver/ToolChains.h
lib/Driver/Tools.cpp
lib/Driver/Tools.h
lib/Driver/Types.cpp
lib/Frontend/CompilerInvocation.cpp
test/Driver/cuda-options.cu
test/Index/attributes-cuda.cu
tools/libclang/CIndex.cpp
unittests/ASTMatchers/ASTMatchersTest.h
EMAIL PREFERENCES
http://reviews.llvm.org/settings/panel/emailpreferences/
Index: include/clang/Driver/Action.h
===================================================================
--- include/clang/Driver/Action.h
+++ include/clang/Driver/Action.h
@@ -41,6 +41,8 @@
enum ActionClass {
InputClass = 0,
BindArchClass,
+ CudaDeviceClass,
+ CudaHostClass,
PreprocessJobClass,
PrecompileJobClass,
AnalyzeJobClass,
@@ -133,6 +135,36 @@
}
};
+class CudaDeviceAction : public Action {
+ virtual void anchor();
+ /// GPU architecture to bind
+ const char *GpuArchName;
+
+public:
+ CudaDeviceAction(std::unique_ptr<Action> Input, const char *_ArchName);
+
+ const char *getGpuArchName() const { return GpuArchName; }
+
+ static bool classof(const Action *A) {
+ return A->getKind() == CudaDeviceClass;
+ }
+};
+
+class CudaHostAction : public Action {
+ virtual void anchor();
+ ActionList DeviceActions;
+
+public:
+ CudaHostAction(std::unique_ptr<Action> Input,
+ const ActionList &DeviceActions);
+ ~CudaHostAction() override;
+
+ ActionList &getDeviceActions() { return DeviceActions; }
+ const ActionList &getDeviceActions() const { return DeviceActions; }
+
+ static bool classof(const Action *A) { return A->getKind() == CudaHostClass; }
+};
+
class JobAction : public Action {
virtual void anchor();
protected:
Index: include/clang/Driver/CC1Options.td
===================================================================
--- include/clang/Driver/CC1Options.td
+++ include/clang/Driver/CC1Options.td
@@ -608,6 +608,8 @@
// CUDA Options
//===----------------------------------------------------------------------===//
+def cuda_include_gpucode : Separate<["-"], "cuda-include-gpucode">,
+ HelpText<"Incorporate CUDA device-side code.">;
def fcuda_is_device : Flag<["-"], "fcuda-is-device">,
HelpText<"Generate code for CUDA device">;
def fcuda_allow_host_calls_from_host_device : Flag<["-"],
Index: include/clang/Driver/Driver.h
===================================================================
--- include/clang/Driver/Driver.h
+++ include/clang/Driver/Driver.h
@@ -409,6 +409,9 @@
///
/// Will cache ToolChains for the life of the driver object, and create them
/// on-demand.
+ const ToolChain &getTargetToolChain(const llvm::opt::ArgList &Args,
+ llvm::Triple &Target) const;
+
const ToolChain &getToolChain(const llvm::opt::ArgList &Args,
StringRef DarwinArchName = "") const;
Index: include/clang/Driver/Options.td
===================================================================
--- include/clang/Driver/Options.td
+++ include/clang/Driver/Options.td
@@ -453,6 +453,10 @@
Group<f_Group>;
def fno_crash_diagnostics : Flag<["-"], "fno-crash-diagnostics">, Group<f_clang_Group>, Flags<[NoArgumentUnused]>;
def fcreate_profile : Flag<["-"], "fcreate-profile">, Group<f_Group>;
+def fcuda_no_device : Flag<["-"], "fcuda-no-device">,
+ HelpText<"Disable device-side CUDA compilation">;
+def fcuda_no_host : Flag<["-"], "fcuda-no-host">,
+ HelpText<"Disable host-side CUDA compilation">;
def fcxx_exceptions: Flag<["-"], "fcxx-exceptions">, Group<f_Group>,
HelpText<"Enable C++ exceptions">, Flags<[CC1Option]>;
def fcxx_modules : Flag <["-"], "fcxx-modules">, Group<f_Group>,
@@ -1064,6 +1068,11 @@
def gsplit_dwarf : Flag<["-"], "gsplit-dwarf">, Group<g_flags_Group>;
def ggnu_pubnames : Flag<["-"], "ggnu-pubnames">, Group<g_flags_Group>;
def gdwarf_aranges : Flag<["-"], "gdwarf-aranges">, Group<g_flags_Group>;
+def gpu_architecture : Separate<["-"], "gpu-architecture">,
+ Flags<[DriverOption, CC1Option, HelpHidden]>,
+ HelpText<"CUDA GPU architecture">;
+def gpu_architecture_EQ : Joined<["--"], "gpu-architecture=">,
+ Flags<[DriverOption]>, Alias<gpu_architecture>;
def headerpad__max__install__names : Joined<["-"], "headerpad_max_install_names">;
def help : Flag<["-", "--"], "help">, Flags<[CC1Option,CC1AsOption]>,
HelpText<"Display available options">;
Index: include/clang/Driver/Types.def
===================================================================
--- include/clang/Driver/Types.def
+++ include/clang/Driver/Types.def
@@ -44,6 +44,7 @@
TYPE("cl", CL, PP_C, "cl", "u")
TYPE("cuda-cpp-output", PP_CUDA, INVALID, "cui", "u")
TYPE("cuda", CUDA, PP_CUDA, "cu", "u")
+TYPE("cuda", CUDA_DEVICE, PP_CUDA, "cu", "")
TYPE("objective-c-cpp-output", PP_ObjC, INVALID, "mi", "u")
TYPE("objc-cpp-output", PP_ObjC_Alias, INVALID, "mi", "u")
TYPE("objective-c", ObjC, PP_ObjC, "m", "u")
Index: include/clang/Frontend/CodeGenOptions.h
===================================================================
--- include/clang/Frontend/CodeGenOptions.h
+++ include/clang/Frontend/CodeGenOptions.h
@@ -160,6 +160,9 @@
/// Name of the profile file to use as input for -fprofile-instr-use
std::string InstrProfileInput;
+ /// List of CUDA GPU code blobs to incorporate
+ std::vector<std::string> CudaGpuCodeFiles;
+
/// Regular expression to select optimizations for which we should enable
/// optimization remarks. Transformation passes whose name matches this
/// expression (and support this feature), will emit a diagnostic
Index: lib/CodeGen/CGCUDANV.cpp
===================================================================
--- lib/CodeGen/CGCUDANV.cpp
+++ lib/CodeGen/CGCUDANV.cpp
@@ -20,6 +20,7 @@
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Verifier.h"
#include <vector>
using namespace clang;
@@ -30,29 +31,65 @@
class CGNVCUDARuntime : public CGCUDARuntime {
private:
- llvm::Type *IntTy, *SizeTy;
- llvm::PointerType *CharPtrTy, *VoidPtrTy;
+ static const int FatbinWrapperMagic = 0x466243b1;
+ static const int FatbinWrapperVersion = 1;
+
+ llvm::Type *IntTy, *SizeTy, *VoidTy;
+ llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
+ llvm::Constant *Zeros[2];
+ llvm::StructType *FatbinWrapperTy;
+
+ llvm::LLVMContext &VMContext;
+ llvm::Module &TheModule;
llvm::Constant *getSetupArgumentFn() const;
llvm::Constant *getLaunchFn() const;
+ llvm::Constant *getRegisterFunctionFn() const;
+
+ llvm::Function *makeRegisterKernelsFn();
+
+ /// Helper function that generates a constant string and returns a pointer to
+ /// the start of the string. The result of this function can be used anywhere
+ /// where the C code specifies const char*.
+ llvm::Constant *MakeConstantString(const std::string &Str,
+ const std::string &Name = "",
+ unsigned Alignment = 0) {
+ llvm::Constant *ConstStr =
+ CGM.GetAddrOfConstantCString(Str, Name.c_str(), Alignment);
+ return llvm::ConstantExpr::getGetElementPtr(ConstStr, Zeros);
+ }
+
+ void EmitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args) override;
public:
CGNVCUDARuntime(CodeGenModule &CGM);
- void EmitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args) override;
+ llvm::Function *ModuleCtorFunction() override;
+ llvm::Function *ModuleDtorFunction() override;
};
}
-CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) : CGCUDARuntime(CGM) {
+CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
+ : CGCUDARuntime(CGM), VMContext(CGM.getLLVMContext()),
+ TheModule(CGM.getModule()) {
CodeGen::CodeGenTypes &Types = CGM.getTypes();
ASTContext &Ctx = CGM.getContext();
IntTy = Types.ConvertType(Ctx.IntTy);
SizeTy = Types.ConvertType(Ctx.getSizeType());
+ VoidTy = llvm::Type::getVoidTy(VMContext);
CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
+ VoidPtrPtrTy = VoidPtrTy->getPointerTo();
+
+ Zeros[0] = llvm::ConstantInt::get(SizeTy, 0);
+ Zeros[1] = Zeros[0];
+
+  // struct { int magic; int version; void *gpu_blob; void *dont_care; };
+ FatbinWrapperTy =
+ llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy, nullptr);
}
llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const {
@@ -68,11 +105,26 @@
llvm::Constant *CGNVCUDARuntime::getLaunchFn() const {
// cudaError_t cudaLaunch(char *)
+ return CGM.CreateRuntimeFunction(
+ llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
+}
+
+llvm::Constant *CGNVCUDARuntime::getRegisterFunctionFn() const {
+ // void __cudaRegisterFunction(void **, const char *, char *, const char *,
+ // int, uint3, uint3, dim3, dim3, int)
std::vector<llvm::Type*> Params;
+ Params.push_back(VoidPtrPtrTy);
Params.push_back(CharPtrTy);
- return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy,
- Params, false),
- "cudaLaunch");
+ Params.push_back(CharPtrTy);
+ Params.push_back(CharPtrTy);
+ Params.push_back(IntTy);
+ Params.push_back(VoidPtrTy);
+ Params.push_back(VoidPtrTy);
+ Params.push_back(VoidPtrTy);
+ Params.push_back(VoidPtrTy);
+ Params.push_back(IntTy->getPointerTo());
+ return CGM.CreateRuntimeFunction(
+ llvm::FunctionType::get(IntTy, Params, false), "__cudaRegisterFunction");
}
void CGNVCUDARuntime::EmitDeviceStubBody(CodeGenFunction &CGF,
@@ -87,8 +139,7 @@
assert(isa<llvm::PointerType>(V->getType()) && "Arg type not PointerType");
ArgTypes.push_back(cast<llvm::PointerType>(V->getType())->getElementType());
}
- llvm::StructType *ArgStackTy = llvm::StructType::get(
- CGF.getLLVMContext(), ArgTypes);
+ llvm::StructType *ArgStackTy = llvm::StructType::get(VMContext, ArgTypes);
llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
@@ -120,6 +171,144 @@
CGF.EmitBlock(EndBlock);
}
+// Creates internal function to register all kernel stubs generated in this
+// module.
+// void .cuda_register_kernels(void** GpuBlobHandle) {
+// // for (Kernel : EmittedKernels) {
+// __cudaRegisterFunction(GpuBlobHandle,Kernel)
+// // }
+// }
+//
+llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() {
+ llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
+ llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
+ llvm::GlobalValue::InternalLinkage, ".cuda_register_kernels", &TheModule);
+ llvm::BasicBlock *EntryBB =
+ llvm::BasicBlock::Create(VMContext, "entry", RegisterKernelsFunc);
+ CGBuilderTy Builder(VMContext);
+ Builder.SetInsertPoint(EntryBB);
+
+ llvm::Constant *RegisterFunc = getRegisterFunctionFn();
+ llvm::Argument &BlobHandlePtr = *RegisterKernelsFunc->arg_begin();
+ for (llvm::Function *Kernel : EmittedKernels) {
+ llvm::Constant *KernelName = MakeConstantString(Kernel->getName());
+ llvm::Value *args[] = {
+ &BlobHandlePtr, // pointer to fatbin handler pointer
+ // Builder.CreatePointerCast(nullptr, CharPtrTy), // kernel stub addr
+ Builder.CreateBitCast(Kernel, VoidPtrTy), // kernel stub addr
+ KernelName, // kernel name string.
+ KernelName, // const name string
+ llvm::ConstantInt::get(IntTy, -1),
+ llvm::ConstantPointerNull::get(VoidPtrTy),
+ llvm::ConstantPointerNull::get(VoidPtrTy),
+ llvm::ConstantPointerNull::get(VoidPtrTy),
+ llvm::ConstantPointerNull::get(VoidPtrTy),
+ llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
+
+ Builder.CreateCall(RegisterFunc, args);
+ }
+
+ Builder.CreateRetVoid();
+
+ llvm::verifyFunction(*RegisterKernelsFunc);
+ return RegisterKernelsFunc;
+}
+
+// Creates a global constructor function for the module:
+// void .cuda_module_ctor(void*) {
+// // for (GpuCodeBlob : GpuCodeBlobs) {
+// Handle = __cudaRegisterFatBinary(GpuCodeBlob);
+// .cuda_register_kernels(Handle);
+// // }
+// }
+llvm::Function *CGNVCUDARuntime::ModuleCtorFunction() {
+ llvm::Function *RegisterKernelsFunc = makeRegisterKernelsFn();
+ llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
+ llvm::FunctionType::get(VoidPtrPtrTy, FatbinWrapperTy->getPointerTo(),
+ false),
+ "__cudaRegisterFatBinary");
+
+ llvm::Function *ModuleCtorFunc = llvm::Function::Create(
+ llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
+ llvm::GlobalValue::InternalLinkage, ".cuda_module_ctor", &TheModule);
+ llvm::BasicBlock *CtorEntryBB =
+ llvm::BasicBlock::Create(VMContext, "entry", ModuleCtorFunc);
+ CGBuilderTy CtorBuilder(VMContext);
+
+ CtorBuilder.SetInsertPoint(CtorEntryBB);
+
+ for (const std::string &GpuCodeFileName :
+ CGM.getCodeGenOpts().CudaGpuCodeFiles) {
+ llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CodeOrErr =
+ llvm::MemoryBuffer::getFileOrSTDIN(GpuCodeFileName);
+ if (std::error_code EC = CodeOrErr.getError()) {
+ CGM.getDiags().Report(diag::err_cannot_open_file) << GpuCodeFileName
+ << EC.message();
+ continue;
+ }
+
+ // Create initialized wrapper structure that points to the loaded GPU blob.
+ llvm::Constant *Values[4] = {
+ llvm::ConstantInt::get(IntTy, FatbinWrapperMagic),
+ llvm::ConstantInt::get(IntTy, FatbinWrapperVersion),
+ MakeConstantString(CodeOrErr.get()->getBuffer(), "", 16),
+ llvm::ConstantPointerNull::get(VoidPtrTy)};
+ llvm::GlobalVariable *FatbinWrapper = new llvm::GlobalVariable(
+ TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage,
+ llvm::ConstantStruct::get(FatbinWrapperTy, Values),
+ ".cuda_fatbin_wrapper");
+ FatbinWrapper->setAlignment(8);
+
+  // FatbinHandle = __cudaRegisterFatBinary(&FatbinWrapper);
+ llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
+ RegisterFatbinFunc,
+ CtorBuilder.CreateBitCast(FatbinWrapper,
+ FatbinWrapperTy->getPointerTo()));
+
+ llvm::GlobalVariable *FatbinHandle = new llvm::GlobalVariable(
+ TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
+ llvm::ConstantPointerNull::get(VoidPtrPtrTy), ".cuda_fatbin_handle");
+ CtorBuilder.CreateStore(RegisterFatbinCall, FatbinHandle, false);
+ // Call .cuda_register_kernels(FatbinHandle);
+ CtorBuilder.CreateCall(RegisterKernelsFunc, RegisterFatbinCall);
+ // Save FatbinHandle so we can unregister it in destructor.
+ FatbinHandles.push_back(FatbinHandle);
+ }
+ CtorBuilder.CreateRetVoid();
+ llvm::verifyFunction(*ModuleCtorFunc);
+ return ModuleCtorFunc;
+}
+
+// Creates a global destructor function that unregisters all GPU code blobs
+// registered by constructor.
+// void .cuda_module_dtor(void*) {
+// // for(Handle: RegisteredHandles) {
+// __cudaUnregisterFatBinary(Handle);
+// // }
+// }
+llvm::Function *CGNVCUDARuntime::ModuleDtorFunction() {
+ llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
+ llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
+ "__cudaUnregisterFatBinary");
+
+ llvm::Function *ModuleDtorFunc = llvm::Function::Create(
+ llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
+ llvm::GlobalValue::InternalLinkage, ".cuda_module_dtor", &TheModule);
+ llvm::BasicBlock *DtorEntryBB =
+ llvm::BasicBlock::Create(VMContext, "entry", ModuleDtorFunc);
+ CGBuilderTy DtorBuilder(VMContext);
+ DtorBuilder.SetInsertPoint(DtorEntryBB);
+
+ for (llvm::GlobalVariable *FatbinHandle : FatbinHandles) {
+ DtorBuilder.CreateCall(UnregisterFatbinFunc,
+ DtorBuilder.CreateLoad(FatbinHandle, false));
+ }
+
+ DtorBuilder.CreateRetVoid();
+ llvm::verifyFunction(*ModuleDtorFunc);
+ return ModuleDtorFunc;
+}
+
CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
return new CGNVCUDARuntime(CGM);
}
Index: lib/CodeGen/CGCUDARuntime.cpp
===================================================================
--- lib/CodeGen/CGCUDARuntime.cpp
+++ lib/CodeGen/CGCUDARuntime.cpp
@@ -53,3 +53,9 @@
return RValue::get(nullptr);
}
+
+void CGCUDARuntime::EmitDeviceStub(CodeGenFunction &CGF,
+ FunctionArgList &Args) {
+ EmittedKernels.push_back(CGF.CurFn);
+ EmitDeviceStubBody(CGF, Args);
+}
Index: lib/CodeGen/CGCUDARuntime.h
===================================================================
--- lib/CodeGen/CGCUDARuntime.h
+++ lib/CodeGen/CGCUDARuntime.h
@@ -16,6 +16,13 @@
#ifndef LLVM_CLANG_LIB_CODEGEN_CGCUDARUNTIME_H
#define LLVM_CLANG_LIB_CODEGEN_CGCUDARUNTIME_H
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+class Function;
+class GlobalVariable;
+}
+
namespace clang {
class CUDAKernelCallExpr;
@@ -32,17 +39,25 @@
protected:
CodeGenModule &CGM;
+ llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
+ llvm::SmallVector<llvm::GlobalVariable *, 16> FatbinHandles;
+
public:
CGCUDARuntime(CodeGenModule &CGM) : CGM(CGM) {}
virtual ~CGCUDARuntime();
virtual RValue EmitCUDAKernelCallExpr(CodeGenFunction &CGF,
const CUDAKernelCallExpr *E,
ReturnValueSlot ReturnValue);
-
+
+ virtual void EmitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args);
+
+ virtual llvm::Function *ModuleCtorFunction() = 0;
+ virtual llvm::Function *ModuleDtorFunction() = 0;
+
+private:
virtual void EmitDeviceStubBody(CodeGenFunction &CGF,
FunctionArgList &Args) = 0;
-
};
/// Creates an instance of a CUDA runtime class.
Index: lib/CodeGen/CodeGenFunction.cpp
===================================================================
--- lib/CodeGen/CodeGenFunction.cpp
+++ lib/CodeGen/CodeGenFunction.cpp
@@ -872,7 +872,7 @@
else if (getLangOpts().CUDA &&
!getLangOpts().CUDAIsDevice &&
FD->hasAttr<CUDAGlobalAttr>())
- CGM.getCUDARuntime().EmitDeviceStubBody(*this, Args);
+ CGM.getCUDARuntime().EmitDeviceStub(*this, Args);
else if (isa<CXXConversionDecl>(FD) &&
cast<CXXConversionDecl>(FD)->isLambdaToBlockPointerConversion()) {
// The lambda conversion to block pointer is special; the semantics can't be
Index: lib/CodeGen/CodeGenModule.cpp
===================================================================
--- lib/CodeGen/CodeGenModule.cpp
+++ lib/CodeGen/CodeGenModule.cpp
@@ -350,6 +350,13 @@
if (ObjCRuntime)
if (llvm::Function *ObjCInitFunction = ObjCRuntime->ModuleInitFunction())
AddGlobalCtor(ObjCInitFunction);
+ if (Context.getLangOpts().CUDA && !Context.getLangOpts().CUDAIsDevice &&
+ CUDARuntime) {
+ if (llvm::Function *CudaCtorFunction = CUDARuntime->ModuleCtorFunction())
+ AddGlobalCtor(CudaCtorFunction);
+ if (llvm::Function *CudaDtorFunction = CUDARuntime->ModuleDtorFunction())
+ AddGlobalDtor(CudaDtorFunction);
+ }
if (PGOReader && PGOStats.hasDiagnostics())
PGOStats.reportDiagnostics(getDiags(), getCodeGenOpts().MainFileName);
EmitCtorList(GlobalCtors, "llvm.global_ctors");
Index: lib/Driver/Action.cpp
===================================================================
--- lib/Driver/Action.cpp
+++ lib/Driver/Action.cpp
@@ -24,6 +24,8 @@
switch (AC) {
case InputClass: return "input";
case BindArchClass: return "bind-arch";
+ case CudaDeviceClass: return "cuda-device";
+ case CudaHostClass: return "cuda-host";
case PreprocessJobClass: return "preprocessor";
case PrecompileJobClass: return "precompiler";
case AnalyzeJobClass: return "analyzer";
@@ -53,6 +55,24 @@
const char *_ArchName)
: Action(BindArchClass, std::move(Input)), ArchName(_ArchName) {}
+void CudaDeviceAction::anchor() {}
+
+CudaDeviceAction::CudaDeviceAction(std::unique_ptr<Action> Input,
+ const char *_ArchName)
+ : Action(CudaDeviceClass, std::move(Input)), GpuArchName(_ArchName) {}
+
+void CudaHostAction::anchor() {}
+
+CudaHostAction::CudaHostAction(std::unique_ptr<Action> Input,
+ const ActionList &_DeviceActions)
+ : Action(CudaHostClass, std::move(Input)), DeviceActions(_DeviceActions) {}
+
+CudaHostAction::~CudaHostAction() {
+ for (iterator it = DeviceActions.begin(), ie = DeviceActions.end(); it != ie;
+ ++it)
+ delete *it;
+}
+
void JobAction::anchor() {}
JobAction::JobAction(ActionClass Kind, std::unique_ptr<Action> Input,
Index: lib/Driver/Driver.cpp
===================================================================
--- lib/Driver/Driver.cpp
+++ lib/Driver/Driver.cpp
@@ -181,9 +181,10 @@
FinalPhase = phases::Backend;
// -c only runs up to the assembler.
- } else if ((PhaseArg = DAL.getLastArg(options::OPT_c))) {
+ } else if ((PhaseArg = DAL.getLastArg(options::OPT_c)) ||
+ (PhaseArg = DAL.getLastArg(options::OPT_fcuda_no_device)) ||
+ (PhaseArg = DAL.getLastArg(options::OPT_fcuda_no_host))) {
FinalPhase = phases::Assemble;
-
// Otherwise do everything.
} else
FinalPhase = phases::Link;
@@ -819,7 +820,25 @@
}
static unsigned PrintActions1(const Compilation &C, Action *A,
- std::map<Action*, unsigned> &Ids) {
+ std::map<Action *, unsigned> &Ids);
+
+static std::string PrintActionList(const Compilation &C, ActionList &AL,
+ std::map<Action *, unsigned> &Ids) {
+ std::string str;
+ llvm::raw_string_ostream os(str);
+ os << "{";
+ for (Action::iterator it = AL.begin(), ie = AL.end(); it != ie;) {
+ os << PrintActions1(C, *it, Ids);
+ ++it;
+ if (it != ie)
+ os << ", ";
+ }
+ os << "}";
+ return str;
+}
+
+static unsigned PrintActions1(const Compilation &C, Action *A,
+ std::map<Action *, unsigned> &Ids) {
if (Ids.count(A))
return Ids[A];
@@ -832,15 +851,14 @@
} else if (BindArchAction *BIA = dyn_cast<BindArchAction>(A)) {
os << '"' << BIA->getArchName() << '"'
<< ", {" << PrintActions1(C, *BIA->begin(), Ids) << "}";
+ } else if (CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
+ os << '"' << CDA->getGpuArchName() << '"' << ", {"
+ << PrintActions1(C, *CDA->begin(), Ids) << "}";
+ } else if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
+ os << "{" << PrintActions1(C, *CHA->begin(), Ids) << "}"
+ << ", gpucode " << PrintActionList(C, CHA->getDeviceActions(), Ids);
} else {
- os << "{";
- for (Action::iterator it = A->begin(), ie = A->end(); it != ie;) {
- os << PrintActions1(C, *it, Ids);
- ++it;
- if (it != ie)
- os << ", ";
- }
- os << "}";
+ os << PrintActionList(C, A->getInputs(), Ids);
}
unsigned Id = Ids.size();
@@ -1149,6 +1167,69 @@
}
}
+// For each unique --gpu-architecture argument creates a TY_CUDA_DEVICE input
+// action and then wraps each in CudaDeviceAction paired with appropriate GPU
+// arch name. If we're only building device-side code, each action remains
+// independent. Otherwise we pass device-side actions as inputs to a new
+// CudaHostAction which combines both host and device side actions.
+static std::unique_ptr<Action>
+BuildCudaActions(const Driver &D, const ToolChain &TC, DerivedArgList &Args,
+ const Arg *InputArg, const types::ID InputType,
+ std::unique_ptr<Action> Current, ActionList &Actions) {
+
+ assert(InputType == types::TY_CUDA &&
+ "CUDA Actions only apply to CUDA inputs.");
+
+ SmallVector<const char *, 4> GpuArchList;
+ llvm::StringSet<> GpuArchNames;
+ for (Arg *A : Args) {
+ if (A->getOption().matches(options::OPT_gpu_architecture)) {
+ A->claim();
+ if (GpuArchNames.insert(A->getValue()).second)
+ GpuArchList.push_back(A->getValue());
+ }
+ }
+
+ if (GpuArchList.empty())
+ GpuArchList.push_back("sm_20");
+
+ Driver::InputList CudaDeviceInputs;
+ for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i)
+ CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg));
+
+ ActionList CudaDeviceActions;
+ D.BuildActions(TC, Args, CudaDeviceInputs, CudaDeviceActions);
+ assert(GpuArchList.size() == CudaDeviceActions.size() &&
+ "Failed to create actions for all devices");
+
+ bool PartialCompilation = false;
+ bool DeviceOnlyCompilation = Args.hasArg(options::OPT_fcuda_no_host);
+ for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i) {
+ if (CudaDeviceActions[i]->getKind() != Action::BackendJobClass) {
+ PartialCompilation = true;
+ break;
+ }
+ }
+
+ if (PartialCompilation || DeviceOnlyCompilation) {
+ for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i)
+ Actions.push_back(new CudaDeviceAction(
+ std::unique_ptr<Action>(CudaDeviceActions[i]), GpuArchList[i]));
+ if (DeviceOnlyCompilation)
+ Current.reset(nullptr);
+ return Current;
+ } else {
+ ActionList CudaDeviceJobActions;
+ for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i)
+ CudaDeviceJobActions.push_back(new CudaDeviceAction(
+ std::unique_ptr<Action>(CudaDeviceActions[i]), GpuArchList[i]));
+
+ std::unique_ptr<Action> HostAction(
+ new CudaHostAction(std::move(Current), CudaDeviceJobActions));
+ return HostAction;
+ }
+}
+
void Driver::BuildActions(const ToolChain &TC, DerivedArgList &Args,
const InputList &Inputs, ActionList &Actions) const {
llvm::PrettyStackTraceString CrashInfo("Building compilation actions");
@@ -1251,8 +1332,26 @@
// Build the pipeline for this file.
std::unique_ptr<Action> Current(new InputAction(*InputArg, InputType));
- for (SmallVectorImpl<phases::ID>::iterator
- i = PL.begin(), e = PL.end(); i != e; ++i) {
+ phases::ID CudaInjectionPhase;
+ if (isSaveTempsEnabled()) {
+ // All phases are done independently, inject GPU blobs during compilation
+ // phase as that's where we generate glue code to init them.
+ CudaInjectionPhase = phases::Compile;
+ } else {
+ // Assumes that clang does everything up until linking phase, so we inject
+ // cuda device actions at the last step before linking. Otherwise CUDA
+ // host action forces preprocessor into a separate invocation.
+ if (FinalPhase == phases::Link) {
+ for (auto i = PL.begin(), e = PL.end(); i != e; ++i) {
+ auto next = i + 1;
+ if (next != e && *next == phases::Link)
+ CudaInjectionPhase = *i;
+ }
+ } else
+ CudaInjectionPhase = FinalPhase;
+ }
+ for (SmallVectorImpl<phases::ID>::iterator i = PL.begin(), e = PL.end();
+ i != e; ++i) {
phases::ID Phase = *i;
// We are done if this step is past what the user requested.
@@ -1274,6 +1373,15 @@
// Otherwise construct the appropriate action.
Current = ConstructPhaseAction(TC, Args, Phase, std::move(Current));
+
+ if (InputType == types::TY_CUDA && Phase == CudaInjectionPhase &&
+ !Args.hasArg(options::OPT_fcuda_no_device)) {
+ Current = BuildCudaActions(*this, TC, Args, InputArg, InputType,
+ std::move(Current), Actions);
+ if (!Current)
+ break;
+ }
+
if (Current->getType() == types::TY_Nothing)
break;
}
@@ -1403,10 +1511,14 @@
if (A->getType() != types::TY_Nothing)
++NumOutputs;
+#if DISABLED_FOR_NOW
+ // TODO: Cuda compilation has more than one input. Need to figure out how to
+ // detect whether it's a cuda compilation.
if (NumOutputs > 1) {
Diag(clang::diag::err_drv_output_argument_with_multiple_files);
FinalOutput = nullptr;
}
+#endif
}
// Collect the list of architectures.
@@ -1521,7 +1633,13 @@
if (isa<BackendJobAction>(JA)) {
// Check if the compiler supports emitting LLVM IR.
assert(Inputs->size() == 1);
- JobAction *CompileJA = cast<CompileJobAction>(*Inputs->begin());
+ JobAction *CompileJA;
+ // Extract real host action, if it's a CudaHostAction.
+ if (CudaHostAction *CudaHA = dyn_cast<CudaHostAction>(*Inputs->begin()))
+ CompileJA = cast<CompileJobAction>(*CudaHA->begin());
+ else
+ CompileJA = cast<CompileJobAction>(*Inputs->begin());
+
const Tool *Compiler = TC->SelectTool(*CompileJA);
if (!Compiler)
return nullptr;
@@ -1549,6 +1667,10 @@
return ToolForJob;
}
+static llvm::Triple computeTargetTriple(StringRef DefaultTargetTriple,
+ const ArgList &Args,
+ StringRef DarwinArchName);
+
void Driver::BuildJobsForAction(Compilation &C,
const Action *A,
const ToolChain *TC,
@@ -1559,6 +1681,20 @@
InputInfo &Result) const {
llvm::PrettyStackTraceString CrashInfo("Building compilation jobs");
+ InputInfoList CudaDeviceInputInfos;
+ if (const CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
+ InputInfo II;
+ // Append outputs of device jobs to the input list.
+ for (const Action *DA : CHA->getDeviceActions()) {
+ BuildJobsForAction(C, DA, TC, "", AtTopLevel,
+ /*MultipleArchs*/ false, LinkingOutput, II);
+ CudaDeviceInputInfos.push_back(II);
+ }
+ // Override current action with a real host compile action and continue
+ // processing it.
+ A = *CHA->begin();
+ }
+
if (const InputAction *IA = dyn_cast<InputAction>(A)) {
// FIXME: It would be nice to not claim this here; maybe the old scheme of
// just using Args was better?
@@ -1581,8 +1717,21 @@
else
TC = &C.getDefaultToolChain();
- BuildJobsForAction(C, *BAA->begin(), TC, BAA->getArchName(),
- AtTopLevel, MultipleArchs, LinkingOutput, Result);
+ BuildJobsForAction(C, *BAA->begin(), TC, ArchName, AtTopLevel,
+ MultipleArchs, LinkingOutput, Result);
+ return;
+ }
+
+ if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
+ const ToolChain *TC;
+ const char *ArchName = CDA->getGpuArchName();
+ llvm::Triple HostTriple =
+ computeTargetTriple(DefaultTargetTriple, C.getArgs(), "");
+ llvm::Triple TargetTriple(HostTriple.isArch64Bit() ? "nvptx64-nvidia-cuda"
+ : "nvptx-nvidia-cuda");
+ TC = &getTargetToolChain(C.getArgs(), TargetTriple);
+ BuildJobsForAction(C, *CDA->begin(), TC, ArchName, AtTopLevel,
+ /*MultipleArchs*/ true, LinkingOutput, Result);
return;
}
@@ -1617,6 +1766,10 @@
if (JA->getType() == types::TY_dSYM)
BaseInput = InputInfos[0].getFilename();
+ // Append outputs of cuda device jobs to the input list
+ if (CudaDeviceInputInfos.size())
+ InputInfos.append(CudaDeviceInputInfos.begin(), CudaDeviceInputInfos.end());
+
// Determine the place to write output to, if any.
if (JA->getType() == types::TY_Nothing)
Result = InputInfo(A->getType(), BaseInput);
@@ -2022,10 +2175,8 @@
return Target;
}
-const ToolChain &Driver::getToolChain(const ArgList &Args,
- StringRef DarwinArchName) const {
- llvm::Triple Target = computeTargetTriple(DefaultTargetTriple, Args,
- DarwinArchName);
+const ToolChain &Driver::getTargetToolChain(const ArgList &Args,
+ llvm::Triple &Target) const {
ToolChain *&TC = ToolChains[Target.str()];
if (!TC) {
@@ -2089,6 +2240,9 @@
break;
}
break;
+ case llvm::Triple::CUDA:
+ TC = new toolchains::Cuda(*this, Target, Args);
+ break;
default:
// TCE is an OSless target
if (Target.getArchName() == "tce") {
@@ -2119,6 +2273,13 @@
return *TC;
}
+const ToolChain &Driver::getToolChain(const ArgList &Args,
+ StringRef DarwinArchName) const {
+ llvm::Triple Target =
+ computeTargetTriple(DefaultTargetTriple, Args, DarwinArchName);
+ return getTargetToolChain(Args, Target);
+}
+
bool Driver::ShouldUseClangCompiler(const JobAction &JA) const {
// Check if user requested no clang, or clang doesn't understand this type (we
// only handle single inputs for now).
Index: lib/Driver/ToolChain.cpp
===================================================================
--- lib/Driver/ToolChain.cpp
+++ lib/Driver/ToolChain.cpp
@@ -151,6 +151,8 @@
case Action::InputClass:
case Action::BindArchClass:
+ case Action::CudaDeviceClass:
+ case Action::CudaHostClass:
case Action::LipoJobClass:
case Action::DsymutilJobClass:
case Action::VerifyDebugInfoJobClass:
Index: lib/Driver/ToolChains.cpp
===================================================================
--- lib/Driver/ToolChains.cpp
+++ lib/Driver/ToolChains.cpp
@@ -3420,6 +3420,62 @@
return new tools::dragonfly::Link(*this);
}
+/// Stub for CUDA toolchain. At the moment we don't have assembler or
+/// linker and need toolchain mainly to propagate device-side options
+/// to CC1.
+
+Cuda::Cuda(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
+ : Linux(D, Triple, Args) {}
+
+void Cuda::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
+ llvm::opt::ArgStringList &CC1Args) const {
+ Linux::addClangTargetOptions(DriverArgs, CC1Args);
+ CC1Args.push_back("-fcuda-is-device");
+}
+
+llvm::opt::DerivedArgList *
+Cuda::TranslateArgs(const llvm::opt::DerivedArgList &Args,
+ const char *BoundArch) const {
+ DerivedArgList *DAL = new DerivedArgList(Args.getBaseArgs());
+ const OptTable &Opts = getDriver().getOpts();
+
+ for (Arg *A : Args) {
+ if (A->getOption().matches(options::OPT_Xarch__)) {
+ // Skip this argument unless the architecture matches BoundArch
+ if (A->getValue(0) != StringRef(BoundArch))
+ continue;
+
+ unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1));
+ unsigned Prev = Index;
+ std::unique_ptr<Arg> XarchArg(Opts.ParseOneArg(Args, Index));
+
+ // If the argument parsing failed or more than one argument was
+ // consumed, the -Xarch_ argument's parameter tried to consume
+ // extra arguments. Emit an error and ignore.
+ //
+ // We also want to disallow any options which would alter the
+ // driver behavior; that isn't going to work in our model. We
+ // use isDriverOption() as an approximation, although things
+ // like -O4 are going to slip through.
+ if (!XarchArg || Index > Prev + 1) {
+ getDriver().Diag(diag::err_drv_invalid_Xarch_argument_with_args)
+ << A->getAsString(Args);
+ continue;
+ } else if (XarchArg->getOption().hasFlag(options::DriverOption)) {
+ getDriver().Diag(diag::err_drv_invalid_Xarch_argument_isdriver)
+ << A->getAsString(Args);
+ continue;
+ }
+ XarchArg->setBaseArg(A);
+ A = XarchArg.release();
+ DAL->AddSynthesizedArg(A);
+ }
+ DAL->append(A);
+ }
+
+ DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch);
+ return DAL;
+}
/// XCore tool chain
XCore::XCore(const Driver &D, const llvm::Triple &Triple,
Index: lib/Driver/ToolChains.h
===================================================================
--- lib/Driver/ToolChains.h
+++ lib/Driver/ToolChains.h
@@ -660,6 +660,18 @@
std::string computeSysRoot() const;
};
+class LLVM_LIBRARY_VISIBILITY Cuda : public Linux {
+public:
+ Cuda(const Driver &D, const llvm::Triple &Triple,
+ const llvm::opt::ArgList &Args);
+
+ llvm::opt::DerivedArgList *
+ TranslateArgs(const llvm::opt::DerivedArgList &Args,
+ const char *BoundArch) const override;
+ void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
+ llvm::opt::ArgStringList &CC1Args) const override;
+};
+
class LLVM_LIBRARY_VISIBILITY Hexagon_TC : public Linux {
protected:
GCCVersion GCCLibAndIncVersion;
Index: lib/Driver/Tools.cpp
===================================================================
--- lib/Driver/Tools.cpp
+++ lib/Driver/Tools.cpp
@@ -1505,6 +1505,12 @@
return CPUName;
}
+ case llvm::Triple::nvptx:
+ case llvm::Triple::nvptx64:
+ if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
+ return A->getValue();
+ return "";
+
case llvm::Triple::ppc:
case llvm::Triple::ppc64:
case llvm::Triple::ppc64le: {
@@ -2559,8 +2565,30 @@
bool IsWindowsCygnus =
getToolChain().getTriple().isWindowsCygwinEnvironment();
bool IsWindowsMSVC = getToolChain().getTriple().isWindowsMSVCEnvironment();
+ bool IsCuda = false;
- assert(Inputs.size() == 1 && "Unable to handle multiple inputs.");
+ assert(Inputs.size() >= 1 && "Must have at least one input.");
+ InputInfoList BaseInputs; // Inputs[0]
+ InputInfoList CudaInputs; // Inputs[1...]
+ const InputInfo &Input = Inputs[0];
+ BaseInputs.push_back(Input);
+
+ if (Inputs.size() > 1) {
+ // Cuda compilation mode may pass more than one file.
+ // Verify that all additional files were derived from the same source.
+ IsCuda = true;
+ StringRef BaseInput(Input.getBaseInput());
+ for (const auto &it : Inputs) {
+ if (BaseInput != StringRef(it.getBaseInput())) {
+ IsCuda = false;
+ break;
+ }
+ }
+ if (IsCuda)
+ CudaInputs.append(std::next(Inputs.begin()), Inputs.end());
+ }
+
+ assert((IsCuda || Inputs.size() == 1) && "Unable to handle multiple inputs.");
// Invoke ourselves in -cc1 mode.
//
@@ -2668,7 +2696,7 @@
// Set the main file name, so that debug info works even with
// -save-temps.
CmdArgs.push_back("-main-file-name");
- CmdArgs.push_back(getBaseInputName(Args, Inputs));
+ CmdArgs.push_back(getBaseInputName(Args, Input));
// Some flags which affect the language (via preprocessor
// defines).
@@ -2696,7 +2724,7 @@
CmdArgs.push_back("-analyzer-checker=deadcode");
- if (types::isCXX(Inputs[0].getType()))
+ if (types::isCXX(Input.getType()))
CmdArgs.push_back("-analyzer-checker=cplusplus");
// Enable the following experimental checkers for testing.
@@ -3224,7 +3252,7 @@
// Explicitly error on some things we know we don't support and can't just
// ignore.
- types::ID InputType = Inputs[0].getType();
+ types::ID InputType = Input.getType();
if (!Args.hasArg(options::OPT_fallow_unsupported)) {
Arg *Unsupported;
if (types::isCXX(InputType) &&
@@ -4575,7 +4603,7 @@
assert(Output.isNothing() && "Invalid output.");
}
- for (const auto &II : Inputs) {
+ for (const auto &II : BaseInputs) {
addDashXForInput(Args, II, CmdArgs);
if (II.isFilename())
@@ -4616,16 +4644,26 @@
const char *SplitDwarfOut;
if (SplitDwarf) {
CmdArgs.push_back("-split-dwarf-file");
- SplitDwarfOut = SplitDebugName(Args, Inputs);
+ SplitDwarfOut = SplitDebugName(Args, BaseInputs);
CmdArgs.push_back(SplitDwarfOut);
}
+ // Include device-side CUDA code
+ if (IsCuda) {
+ for (InputInfoList::const_iterator it = CudaInputs.begin(),
+ ie = CudaInputs.end();
+ it != ie; ++it) {
+ CmdArgs.push_back("-cuda-include-gpucode");
+ CmdArgs.push_back(it->getFilename());
+ }
+ }
+
// Finally add the compile command to the compilation.
if (Args.hasArg(options::OPT__SLASH_fallback) &&
Output.getType() == types::TY_Object &&
(InputType == types::TY_C || InputType == types::TY_CXX)) {
- auto CLCommand =
- getCLFallback()->GetCommand(C, JA, Output, Inputs, Args, LinkingOutput);
+ auto CLCommand = getCLFallback()->GetCommand(C, JA, Output, BaseInputs,
+ Args, LinkingOutput);
C.addCommand(llvm::make_unique<FallbackCommand>(JA, *this, Exec, CmdArgs,
std::move(CLCommand)));
} else {
@@ -5683,9 +5721,13 @@
}
const char *Clang::getBaseInputName(const ArgList &Args,
+ const InputInfo &Input) {
+ return Args.MakeArgString(llvm::sys::path::filename(Input.getBaseInput()));
+}
+
+const char *Clang::getBaseInputName(const ArgList &Args,
const InputInfoList &Inputs) {
- return Args.MakeArgString(
- llvm::sys::path::filename(Inputs[0].getBaseInput()));
+ return getBaseInputName(Args, Inputs[0]);
}
const char *Clang::getBaseInputStem(const ArgList &Args,
Index: lib/Driver/Tools.h
===================================================================
--- lib/Driver/Tools.h
+++ lib/Driver/Tools.h
@@ -41,6 +41,8 @@
public:
static const char *getBaseInputName(const llvm::opt::ArgList &Args,
const InputInfoList &Inputs);
+ static const char *getBaseInputName(const llvm::opt::ArgList &Args,
+ const InputInfo &Input);
static const char *getBaseInputStem(const llvm::opt::ArgList &Args,
const InputInfoList &Inputs);
static const char *getDependencyFileName(const llvm::opt::ArgList &Args,
Index: lib/Driver/Types.cpp
===================================================================
--- lib/Driver/Types.cpp
+++ lib/Driver/Types.cpp
@@ -86,6 +86,7 @@
case TY_C: case TY_PP_C:
case TY_CL:
case TY_CUDA: case TY_PP_CUDA:
+ case TY_CUDA_DEVICE:
case TY_ObjC: case TY_PP_ObjC: case TY_PP_ObjC_Alias:
case TY_CXX: case TY_PP_CXX:
case TY_ObjCXX: case TY_PP_ObjCXX: case TY_PP_ObjCXX_Alias:
@@ -122,7 +123,7 @@
case TY_ObjCXX: case TY_PP_ObjCXX: case TY_PP_ObjCXX_Alias:
case TY_CXXHeader: case TY_PP_CXXHeader:
case TY_ObjCXXHeader: case TY_PP_ObjCXXHeader:
- case TY_CUDA: case TY_PP_CUDA:
+ case TY_CUDA: case TY_PP_CUDA: case TY_CUDA_DEVICE:
return true;
}
}
@@ -206,10 +207,12 @@
P.push_back(phases::Compile);
P.push_back(phases::Backend);
}
- P.push_back(phases::Assemble);
+ if (Id != TY_CUDA_DEVICE)
+ P.push_back(phases::Assemble);
}
}
- if (!onlyPrecompileType(Id)) {
+
+ if (!onlyPrecompileType(Id) && Id != TY_CUDA_DEVICE) {
P.push_back(phases::Link);
}
assert(0 < P.size() && "Not enough phases in list");
Index: lib/Frontend/CompilerInvocation.cpp
===================================================================
--- lib/Frontend/CompilerInvocation.cpp
+++ lib/Frontend/CompilerInvocation.cpp
@@ -639,6 +639,8 @@
Args.getAllArgValues(OPT_fsanitize_recover_EQ), Diags,
Opts.SanitizeRecover);
+ Opts.CudaGpuCodeFiles = Args.getAllArgValues(OPT_cuda_include_gpucode);
+
return Success;
}
Index: test/Driver/cuda-options.cu
===================================================================
--- /dev/null
+++ test/Driver/cuda-options.cu
@@ -0,0 +1,108 @@
+// Tests CUDA compilation pipeline construction in Driver.
+
+// Simple compilation case:
+// RUN: %clang -### -nocudainc -c %s 2>&1 \
+// Compile device-side to PTX assembly and make sure we use it on the host side.
+// RUN: | FileCheck -check-prefix CUDA-D1 \
+// Then compile host side and incorporate device code.
+// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
+// Make sure we don't link anything.
+// RUN: -check-prefix CUDA-NL %s
+
+// Typical compilation + link case:
+// RUN: %clang -### -nocudainc %s 2>&1 \
+// Compile device-side to PTX assembly and make sure we use it on the host side
+// RUN: | FileCheck -check-prefix CUDA-D1 \
+// Then compile host side and incorporate device code.
+// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
+// Then link things.
+// RUN: -check-prefix CUDA-L %s
+
+// Verify that -cuda-no-device disables device-side compilation and linking
+// RUN: %clang -### -nocudainc -fcuda-no-device %s 2>&1 \
+// Make sure we didn't run device-side compilation.
+// RUN: | FileCheck -check-prefix CUDA-ND \
+// Then compile host side and make sure we don't attempt to incorporate GPU code.
+// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-NI \
+// Make sure we don't link anything.
+// RUN: -check-prefix CUDA-NL %s
+
+// Verify that -cuda-no-host disables host-side compilation and linking
+// RUN: %clang -### -nocudainc -fcuda-no-host %s 2>&1 \
+// Compile device-side to PTX assembly
+// RUN: | FileCheck -check-prefix CUDA-D1 \
+// Make sure there is no host compilation or linking.
+// RUN: -check-prefix CUDA-NH -check-prefix CUDA-NL %s
+
+// Verify that with -S we compile host and device sides to assembly
+// and incorporate device code on the host side.
+// RUN: %clang -### -nocudainc -S -c %s 2>&1 \
+// Compile device-side to PTX assembly
+// RUN: | FileCheck -check-prefix CUDA-D1 \
+// Then compile host side and incorporate GPU code.
+// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
+// Make sure we don't link anything.
+// RUN: -check-prefix CUDA-NL %s
+
+// Verify that --gpu-architecture option passes correct GPU
+// architecture info to device compilation.
+// RUN: %clang -### -nocudainc --gpu-architecture=sm_35 -c %s 2>&1 \
+// Compile device-side to PTX assembly.
+// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1-SM35 \
+// Then compile host side and incorporate GPU code.
+// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
+// Make sure we don't link anything.
+// RUN: -check-prefix CUDA-NL %s
+
+// Verify that there is device-side compilation per --gpu-architecture args
+// and that all results are included on the host side.
+// RUN: %clang -### -nocudainc --gpu-architecture=sm_35 --gpu-architecture=sm_30 -c %s 2>&1 \
+// Compile both device-sides to PTX assembly
+// RUN: | FileCheck \
+// RUN: -check-prefix CUDA-D1 -check-prefix CUDA-D1-SM35 \
+// RUN: -check-prefix CUDA-D2 -check-prefix CUDA-D2-SM30 \
+// Then compile host side and incorporate both device-side outputs
+// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 -check-prefix CUDA-H-I2 \
+// Make sure we don't link anything.
+// RUN: -check-prefix CUDA-NL %s
+
+// Match device-side compilation
+// CUDA-D1: "-cc1" "-triple" "nvptx{{64?}}-nvidia-cuda"
+// CUDA-D1-SAME: "-fcuda-is-device"
+// CUDA-D1-SM35-SAME: "-target-cpu" "sm_35"
+// CUDA-D1-SAME: "-o" "[[GPUCODE1:[^"]*]]"
+// CUDA-D1-SAME: "-x" "cuda"
+
+// Match another device-side compilation
+// CUDA-D2: "-cc1" "-triple" "nvptx{{64?}}-nvidia-cuda"
+// CUDA-D2-SAME: "-fcuda-is-device"
+// CUDA-D2-SM30-SAME: "-target-cpu" "sm_30"
+// CUDA-D2-SAME: "-o" "[[GPUCODE2:[^"]*]]"
+// CUDA-D2-SAME: "-x" "cuda"
+
+// Match no device-side compilation
+// CUDA-ND-NOT: "-cc1" "-triple" "nvptx{{64?}}-nvidia-cuda"
+// CUDA-ND-SAME-NOT: "-fcuda-is-device"
+
+// Match host-side compilation
+// CUDA-H: "-cc1" "-triple"
+// CUDA-H-SAME-NOT: "nvptx{{64?}}-nvidia-cuda"
+// CUDA-H-SAME-NOT: "-fcuda-is-device"
+// CUDA-H-SAME: "-o" "[[HOSTOBJ:[^"]*]]"
+// CUDA-H-SAME: "-x" "cuda"
+// CUDA-H-I1-SAME: "-cuda-include-gpucode" "[[GPUCODE1]]"
+// CUDA-H-I2-SAME: "-cuda-include-gpucode" "[[GPUCODE2]]"
+
+// Match no GPU code inclusion.
+// CUDA-H-NI-NOT: "-cuda-include-gpucode"
+
+// Match no CUDA compilation
+// CUDA-NH-NOT: "-cc1" "-triple"
+// CUDA-NH-SAME-NOT: "-x" "cuda"
+
+// Match linker
+// CUDA-L: "{{.*}}ld{{(.exe)?}}"
+// CUDA-L-SAME: "[[HOSTOBJ]]"
+
+// Match no linker
+// CUDA-NL-NOT: "{{.*}}ld{{(.exe)?}}"
Index: test/Index/attributes-cuda.cu
===================================================================
--- test/Index/attributes-cuda.cu
+++ test/Index/attributes-cuda.cu
@@ -1,5 +1,5 @@
-// RUN: c-index-test -test-load-source all -x cuda %s | FileCheck %s
-
+// RUN: c-index-test -test-load-source all -x cuda -nocudainc -fcuda-no-device %s | FileCheck %s
+// RUN: c-index-test -test-load-source all -x cuda -nocudainc -fcuda-no-host %s | FileCheck %s
__attribute__((device)) void f_device();
__attribute__((global)) void f_global();
__attribute__((constant)) int* g_constant;
Index: tools/libclang/CIndex.cpp
===================================================================
--- tools/libclang/CIndex.cpp
+++ tools/libclang/CIndex.cpp
@@ -2979,6 +2979,11 @@
/*AllowPCHWithCompilerErrors=*/true, SkipFunctionBodies,
/*UserFilesAreVolatile=*/true, ForSerialization, &ErrUnit));
+ if (!Unit && !ErrUnit) {
+ PTUI->result = CXError_ASTReadError;
+ return;
+ }
+
if (NumErrors != Diags->getClient()->getNumErrors()) {
// Make sure to check that 'Unit' is non-NULL.
if (CXXIdx->getDisplayDiagnostics())
Index: unittests/ASTMatchers/ASTMatchersTest.h
===================================================================
--- unittests/ASTMatchers/ASTMatchersTest.h
+++ unittests/ASTMatchers/ASTMatchersTest.h
@@ -163,6 +163,7 @@
std::vector<std::string> Args;
Args.push_back("-xcuda");
Args.push_back("-fno-ms-extensions");
+ Args.push_back("-fcuda-no-device");
Args.push_back(CompileArg);
if (!runToolOnCodeWithArgs(Factory->create(),
CudaHeader + Code, Args)) {
_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits