jlebar created this revision. jlebar added a reviewer: rnk. jlebar added subscribers: tra, echristo, jhen, cfe-commits.
This is necessary to prevent llvm from generating stacksave intrinsics around this alloca. NVVM doesn't have a stack, and we don't handle said intrinsics. I'm not sure if appending the alloca to the beginning of the entry block is right. Adding it to the end would make more sense to me, but then I'm not sure how to ensure I'm not clobbering the terminator (except by always assuming the BB is nonempty and inserting right before BB.back()?). http://reviews.llvm.org/D16664 Files: lib/CodeGen/CGCUDABuiltin.cpp test/CodeGenCUDA/printf.cu Index: test/CodeGenCUDA/printf.cu =================================================================== --- test/CodeGenCUDA/printf.cu +++ test/CodeGenCUDA/printf.cu @@ -51,3 +51,14 @@ // CHECK: call i32 @vprintf({{.*}}, i8* null){{$}} printf("hello, world!"); } + +// Check that printf's alloca happens in the entry block, not inside the if +// statement. +__device__ bool foo(); +__device__ void CheckAllocaIsInEntryBlock() { + // CHECK: alloca i8, i32 4 align 4 + // CHECK: call @_Z3foov() + if (foo()) { + printf("%d", 42); + } +} Index: lib/CodeGen/CGCUDABuiltin.cpp =================================================================== --- lib/CodeGen/CGCUDABuiltin.cpp +++ lib/CodeGen/CGCUDABuiltin.cpp @@ -102,9 +102,15 @@ // If there are no args, pass a null pointer to vprintf. BufferPtr = llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx)); } else { - BufferPtr = Builder.Insert(new llvm::AllocaInst( + // Insert our alloca not into the current BB, but into the function's entry + // block. This is important because nvvm doesn't support alloca -- if we + // put the alloca anywhere else, llvm may eventually output + // stacksave/stackrestore intrinsics, which cause ptxas to choke. + auto *Alloca = new llvm::AllocaInst( llvm::Type::getInt8Ty(Ctx), llvm::ConstantInt::get(Int32Ty, BufSize), - BufAlign, "printf_arg_buf")); + BufAlign, "printf_arg_buf"); + CurFn->getEntryBlock().getInstList().push_front(Alloca); + BufferPtr = Alloca; unsigned Offset = 0; for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) {
Index: test/CodeGenCUDA/printf.cu =================================================================== --- test/CodeGenCUDA/printf.cu +++ test/CodeGenCUDA/printf.cu @@ -51,3 +51,14 @@ // CHECK: call i32 @vprintf({{.*}}, i8* null){{$}} printf("hello, world!"); } + +// Check that printf's alloca happens in the entry block, not inside the if +// statement. +__device__ bool foo(); +__device__ void CheckAllocaIsInEntryBlock() { + // CHECK: alloca i8, i32 4 align 4 + // CHECK: call @_Z3foov() + if (foo()) { + printf("%d", 42); + } +} Index: lib/CodeGen/CGCUDABuiltin.cpp =================================================================== --- lib/CodeGen/CGCUDABuiltin.cpp +++ lib/CodeGen/CGCUDABuiltin.cpp @@ -102,9 +102,15 @@ // If there are no args, pass a null pointer to vprintf. BufferPtr = llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx)); } else { - BufferPtr = Builder.Insert(new llvm::AllocaInst( + // Insert our alloca not into the current BB, but into the function's entry + // block. This is important because nvvm doesn't support alloca -- if we + // put the alloca anywhere else, llvm may eventually output + // stacksave/stackrestore intrinsics, which cause ptxas to choke. + auto *Alloca = new llvm::AllocaInst( llvm::Type::getInt8Ty(Ctx), llvm::ConstantInt::get(Int32Ty, BufSize), - BufAlign, "printf_arg_buf")); + BufAlign, "printf_arg_buf"); + CurFn->getEntryBlock().getInstList().push_front(Alloca); + BufferPtr = Alloca; unsigned Offset = 0; for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) {
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits