https://github.com/DominikAdamski updated https://github.com/llvm/llvm-project/pull/67000
>From e801022968ea4a42632fbcf4c5ba03e67a32c7ae Mon Sep 17 00:00:00 2001 From: Dominik Adamski <dominik.adam...@amd.com> Date: Mon, 11 Sep 2023 05:31:37 -0400 Subject: [PATCH] [OpenMP][OMPIRBuilder] Add support to omp target parallel Added support for LLVM IR code generation which is used for handling omp target parallel code. A call to __kmpc_parallel_51 is generated and the parallel region is outlined into a separate function. The proper setup of kmpc_target_init mode is not included in this commit. It is assumed that the SPMD mode for target init is properly set by other codegen functions. --- clang/test/OpenMP/cancel_codegen.cpp | 20 +- clang/test/OpenMP/parallel_codegen.cpp | 4 +- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 314 +++++++++++++----- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 1 + .../Frontend/OpenMPIRBuilderTest.cpp | 139 +++++++- 5 files changed, 378 insertions(+), 100 deletions(-) diff --git a/clang/test/OpenMP/cancel_codegen.cpp b/clang/test/OpenMP/cancel_codegen.cpp index 53580e0c2b0293f..03024cf331b2717 100644 --- a/clang/test/OpenMP/cancel_codegen.cpp +++ b/clang/test/OpenMP/cancel_codegen.cpp @@ -1026,25 +1026,25 @@ for (int i = 0; i < argc; ++i) { // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) -// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14 -// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !14 -// CHECK3-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 -// CHECK3-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 -// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !14 -// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14 -// CHECK3-NEXT: 
[[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14 +// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias ![[NOALIAS0:[0-9]+]] +// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias ![[NOALIAS0]] +// CHECK3-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias ![[NOALIAS0]] +// CHECK3-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias ![[NOALIAS0]] +// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias ![[NOALIAS0]] +// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias ![[NOALIAS0]] +// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias ![[NOALIAS0]] // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB12:[0-9]+]]) // CHECK3-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_cancel(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i32 4) // CHECK3-NEXT: [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0 // CHECK3-NEXT: br i1 [[TMP10]], label [[DOTCANCEL_EXIT_I:%.*]], label [[DOTCANCEL_CONTINUE_I:%.*]] // CHECK3: .cancel.exit.i: -// CHECK3-NEXT: store i32 1, ptr [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !14 +// CHECK3-NEXT: store i32 1, ptr [[CLEANUP_DEST_SLOT_I]], align 4, !noalias ![[NOALIAS1:[0-9]+]] // CHECK3-NEXT: br label [[DOTOMP_OUTLINED__EXIT:%.*]] // CHECK3: .cancel.continue.i: -// CHECK3-NEXT: store i32 0, ptr [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !14 +// CHECK3-NEXT: store i32 0, ptr [[CLEANUP_DEST_SLOT_I]], align 4, !noalias ![[NOALIAS1]] // CHECK3-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] // CHECK3: .omp_outlined..exit: -// CHECK3-NEXT: [[CLEANUP_DEST_I:%.*]] = load i32, ptr [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !14 +// CHECK3-NEXT: [[CLEANUP_DEST_I:%.*]] = load i32, ptr [[CLEANUP_DEST_SLOT_I]], align 4, !noalias ![[NOALIAS1]] // CHECK3-NEXT: ret i32 0 // // diff --git a/clang/test/OpenMP/parallel_codegen.cpp 
b/clang/test/OpenMP/parallel_codegen.cpp index 5c98761be0808ef..d545b4a9d9fa887 100644 --- a/clang/test/OpenMP/parallel_codegen.cpp +++ b/clang/test/OpenMP/parallel_codegen.cpp @@ -812,7 +812,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIPPcEiT_..omp_par -// CHECK3-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] { +// CHECK3-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] // CHECK3-NEXT: omp.par.entry: // CHECK3-NEXT: [[GEP__RELOADED:%.*]] = getelementptr { ptr, ptr }, ptr [[TMP0]], i32 0, i32 0 // CHECK3-NEXT: [[LOADGEP__RELOADED:%.*]] = load ptr, ptr [[GEP__RELOADED]], align 8 @@ -956,7 +956,7 @@ int main (int argc, char **argv) { // // // CHECK4-LABEL: define {{[^@]+}}@_Z5tmainIPPcEiT_..omp_par -// CHECK4-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] !dbg [[DBG57:![0-9]+]] { +// CHECK4-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !dbg [[DBG57:![0-9]+]] { // CHECK4-NEXT: omp.par.entry: // CHECK4-NEXT: [[GEP__RELOADED:%.*]] = getelementptr { ptr, ptr }, ptr [[TMP0]], i32 0, i32 0 // CHECK4-NEXT: [[LOADGEP__RELOADED:%.*]] = load ptr, ptr [[GEP__RELOADED]], align 8 diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 5b24e9fe2e0c5bd..5a305931bf23ea3 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -638,6 +638,13 @@ void OpenMPIRBuilder::finalize(Function *Fn) { Function *OuterFn = OI.getFunction(); CodeExtractorAnalysisCache CEAC(*OuterFn); + // If we generate code for the target device, we need to allocate + // struct for aggregate params in the device default alloca address space. + // OpenMP runtime requires that the params of the extracted functions are + // passed as zero address space pointers. 
This flag ensures that + // CodeExtractor generates correct code for extracted functions + // which are used by OpenMP runtime. + bool ArgsInZeroAddressSpace = Config.isTargetDevice(); CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr, /* AggregateArgs */ true, /* BlockFrequencyInfo */ nullptr, @@ -646,7 +653,7 @@ void OpenMPIRBuilder::finalize(Function *Fn) { /* AllowVarArgs */ true, /* AllowAlloca */ true, /* AllocaBlock*/ OI.OuterAllocaBB, - /* Suffix */ ".omp_par"); + /* Suffix */ ".omp_par", ArgsInZeroAddressSpace); LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n"); LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName() @@ -1126,6 +1133,185 @@ void OpenMPIRBuilder::emitCancelationCheckImpl(Value *CancelFlag, Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin()); } +// Callback used to create OpenMP runtime calls to support +// omp parallel clause for the device. +// We need to use this callback to replace call to the OutlinedFn in OuterFn +// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_51) +static void +targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, + Function *OuterFn, Value *Ident, Value *IfCondition, + Value *NumThreads, Instruction *PrivTID, + AllocaInst *PrivTIDAddr, Value *ThreadID, + const SmallVector<Instruction *, 4> &ToBeDeleted) { + // Add some known attributes. 
+ Module &M = OMPIRBuilder->M; + IRBuilder<> &Builder = OMPIRBuilder->Builder; + OutlinedFn.addParamAttr(0, Attribute::NoAlias); + OutlinedFn.addParamAttr(1, Attribute::NoAlias); + OutlinedFn.addParamAttr(0, Attribute::NoUndef); + OutlinedFn.addParamAttr(1, Attribute::NoUndef); + OutlinedFn.addFnAttr(Attribute::NoUnwind); + + assert(OutlinedFn.arg_size() >= 2 && + "Expected at least tid and bounded tid as arguments"); + unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2; + + CallInst *CI = cast<CallInst>(OutlinedFn.user_back()); + assert(CI && "Expected call instruction to outlined function"); + CI->getParent()->setName("omp_parallel"); + // Replace direct call to the outlined function by the call to + // __kmpc_parallel_51 + Builder.SetInsertPoint(CI); + + // Build call __kmpc_parallel_51 + auto PtrTy = Type::getInt8PtrTy(M.getContext()); + Value *Void = ConstantPointerNull::get(PtrTy); + // Add alloca for kernel args. Put this instruction at the beginning + // of the function. + OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP(); + Builder.SetInsertPoint(&OuterFn->front(), + OuterFn->front().getFirstInsertionPt()); + AllocaInst *ArgsAlloca = + Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars)); + Value *Args = + Builder.CreatePointerCast(ArgsAlloca, Type::getInt8PtrTy(M.getContext())); + Builder.restoreIP(CurrentIP); + // Store captured vars which are used by kmpc_parallel_51 + if (NumCapturedVars) { + for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) { + Value *V = *(CI->arg_begin() + 2 + Idx); + Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64( + ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx); + Builder.CreateStore(V, StoreAddress); + } + } + Value *Cond = IfCondition ? 
Builder.CreateSExtOrTrunc( + IfCondition, Type::getInt32Ty(M.getContext())) + : Builder.getInt32(1); + Value *Parallel51CallArgs[] = { + /* identifier*/ Ident, + /* global thread num*/ ThreadID, + /* if expression */ Cond, NumThreads ? NumThreads : Builder.getInt32(-1), + /* Proc bind */ Builder.getInt32(-1), + /* outlined function */ + Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr), Void, + Args, Builder.getInt64(NumCapturedVars)}; + + SmallVector<Value *, 16> RealArgs; + RealArgs.append(std::begin(Parallel51CallArgs), std::end(Parallel51CallArgs)); + FunctionCallee RTLFn = + OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51); + + Builder.CreateCall(RTLFn, RealArgs); + + LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: " + << *Builder.GetInsertBlock()->getParent() << "\n"); + + // Initialize the local TID stack location with the argument value. + Builder.SetInsertPoint(PrivTID); + Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin(); + Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI), + PrivTIDAddr); + + // Remove redundant call to the outlined function. + CI->eraseFromParent(); + + for (Instruction *I : ToBeDeleted) { + I->eraseFromParent(); + } +} + +// Callback used to create OpenMP runtime calls to support +// omp parallel clause for the host. 
+// We need to use this callback to replace call to the OutlinedFn in OuterFn +// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if]) +static void +hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, + Function *OuterFn, Value *Ident, Value *IfCondition, + Instruction *PrivTID, AllocaInst *PrivTIDAddr, + const SmallVector<Instruction *, 4> &ToBeDeleted) { + Module &M = OMPIRBuilder->M; + IRBuilder<> &Builder = OMPIRBuilder->Builder; + FunctionCallee RTLFn; + if (IfCondition) { + RTLFn = + OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if); + } else { + RTLFn = + OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call); + } + if (auto *F = dyn_cast<llvm::Function>(RTLFn.getCallee())) { + if (!F->hasMetadata(llvm::LLVMContext::MD_callback)) { + llvm::LLVMContext &Ctx = F->getContext(); + MDBuilder MDB(Ctx); + // Annotate the callback behavior of the __kmpc_fork_call: + // - The callback callee is argument number 2 (microtask). + // - The first two arguments of the callback callee are unknown (-1). + // - All variadic arguments to the __kmpc_fork_call are passed to the + // callback callee. + F->addMetadata( + llvm::LLVMContext::MD_callback, + *llvm::MDNode::get( + Ctx, {MDB.createCallbackEncoding(2, {-1, -1}, + /* VarArgsArePassed */ true)})); + } + } + // Add some known attributes. 
+ OutlinedFn.addParamAttr(0, Attribute::NoAlias); + OutlinedFn.addParamAttr(1, Attribute::NoAlias); + OutlinedFn.addFnAttr(Attribute::NoUnwind); + + assert(OutlinedFn.arg_size() >= 2 && + "Expected at least tid and bounded tid as arguments"); + unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2; + + CallInst *CI = cast<CallInst>(OutlinedFn.user_back()); + CI->getParent()->setName("omp_parallel"); + Builder.SetInsertPoint(CI); + + // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn); + Value *ForkCallArgs[] = { + Ident, Builder.getInt32(NumCapturedVars), + Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)}; + + SmallVector<Value *, 16> RealArgs; + RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs)); + if (IfCondition) { + Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, + Type::getInt32Ty(M.getContext())); + RealArgs.push_back(Cond); + } + RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end()); + + // __kmpc_fork_call_if always expects a void ptr as the last argument + // If there are no arguments, pass a null pointer. + auto PtrTy = Type::getInt8PtrTy(M.getContext()); + if (IfCondition && NumCapturedVars == 0) { + Value *Void = ConstantPointerNull::get(PtrTy); + RealArgs.push_back(Void); + } + if (IfCondition && RealArgs.back()->getType() != PtrTy) + RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy); + + Builder.CreateCall(RTLFn, RealArgs); + + LLVM_DEBUG(dbgs() << "With fork_call placed: " + << *Builder.GetInsertBlock()->getParent() << "\n"); + + // Initialize the local TID stack location with the argument value. + Builder.SetInsertPoint(PrivTID); + Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin(); + Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI), + PrivTIDAddr); + + // Remove redundant call to the outlined function. 
+ CI->eraseFromParent(); + + for (Instruction *I : ToBeDeleted) { + I->eraseFromParent(); + } +} + IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel( const LocationDescription &Loc, InsertPointTy OuterAllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, @@ -1140,6 +1326,12 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel( Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadID = getOrCreateThreadID(Ident); + // If we generate code for the target device, we need to allocate + // struct for aggregate params in the device default alloca address space. + // OpenMP runtime requires that the params of the extracted functions are + // passed as zero address space pointers. This flag ensures that extracted + // function arguments are declared in zero address space + bool ArgsInZeroAddressSpace = Config.isTargetDevice(); if (NumThreads) { // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads) @@ -1173,13 +1365,28 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel( // Change the location to the outer alloca insertion point to create and // initialize the allocas we pass into the parallel region. 
Builder.restoreIP(OuterAllocaIP); - AllocaInst *TIDAddr = Builder.CreateAlloca(Int32, nullptr, "tid.addr"); - AllocaInst *ZeroAddr = Builder.CreateAlloca(Int32, nullptr, "zero.addr"); + AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr"); + AllocaInst *ZeroAddrAlloca = + Builder.CreateAlloca(Int32, nullptr, "zero.addr"); + Instruction *TIDAddr = TIDAddrAlloca; + Instruction *ZeroAddr = ZeroAddrAlloca; + if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) { + // Add additional casts to enforce pointers in zero address space + TIDAddr = new AddrSpaceCastInst( + TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast"); + TIDAddr->insertAfter(TIDAddrAlloca); + ToBeDeleted.push_back(TIDAddr); + ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca, + PointerType ::get(M.getContext(), 0), + "zero.addr.ascast"); + ZeroAddr->insertAfter(ZeroAddrAlloca); + ToBeDeleted.push_back(ZeroAddr); + } // We only need TIDAddr and ZeroAddr for modeling purposes to get the // associated arguments in the outlined function, so we delete them later. - ToBeDeleted.push_back(TIDAddr); - ToBeDeleted.push_back(ZeroAddr); + ToBeDeleted.push_back(TIDAddrAlloca); + ToBeDeleted.push_back(ZeroAddrAlloca); // Create an artificial insertion point that will also ensure the blocks we // are about to split are not degenerated. 
@@ -1247,87 +1454,24 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel( BodyGenCB(InnerAllocaIP, CodeGenIP); LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n"); - FunctionCallee RTLFn; - if (IfCondition) - RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if); - else - RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call); - - if (auto *F = dyn_cast<llvm::Function>(RTLFn.getCallee())) { - if (!F->hasMetadata(llvm::LLVMContext::MD_callback)) { - llvm::LLVMContext &Ctx = F->getContext(); - MDBuilder MDB(Ctx); - // Annotate the callback behavior of the __kmpc_fork_call: - // - The callback callee is argument number 2 (microtask). - // - The first two arguments of the callback callee are unknown (-1). - // - All variadic arguments to the __kmpc_fork_call are passed to the - // callback callee. - F->addMetadata( - llvm::LLVMContext::MD_callback, - *llvm::MDNode::get( - Ctx, {MDB.createCallbackEncoding(2, {-1, -1}, - /* VarArgsArePassed */ true)})); - } - } OutlineInfo OI; - OI.PostOutlineCB = [=](Function &OutlinedFn) { - // Add some known attributes. 
- OutlinedFn.addParamAttr(0, Attribute::NoAlias); - OutlinedFn.addParamAttr(1, Attribute::NoAlias); - OutlinedFn.addFnAttr(Attribute::NoUnwind); - OutlinedFn.addFnAttr(Attribute::NoRecurse); - - assert(OutlinedFn.arg_size() >= 2 && - "Expected at least tid and bounded tid as arguments"); - unsigned NumCapturedVars = - OutlinedFn.arg_size() - /* tid & bounded tid */ 2; - - CallInst *CI = cast<CallInst>(OutlinedFn.user_back()); - CI->getParent()->setName("omp_parallel"); - Builder.SetInsertPoint(CI); - - // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn); - Value *ForkCallArgs[] = { - Ident, Builder.getInt32(NumCapturedVars), - Builder.CreateBitCast(&OutlinedFn, ParallelTaskPtr)}; - - SmallVector<Value *, 16> RealArgs; - RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs)); - if (IfCondition) { - Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, - Type::getInt32Ty(M.getContext())); - RealArgs.push_back(Cond); - } - RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end()); - - // __kmpc_fork_call_if always expects a void ptr as the last argument - // If there are no arguments, pass a null pointer. - auto PtrTy = Type::getInt8PtrTy(M.getContext()); - if (IfCondition && NumCapturedVars == 0) { - llvm::Value *Void = ConstantPointerNull::get(PtrTy); - RealArgs.push_back(Void); - } - if (IfCondition && RealArgs.back()->getType() != PtrTy) - RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy); - - Builder.CreateCall(RTLFn, RealArgs); - - LLVM_DEBUG(dbgs() << "With fork_call placed: " - << *Builder.GetInsertBlock()->getParent() << "\n"); - - InsertPointTy ExitIP(PRegExitBB, PRegExitBB->end()); - - // Initialize the local TID stack location with the argument value. 
- Builder.SetInsertPoint(PrivTID); - Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin(); - Builder.CreateStore(Builder.CreateLoad(Int32, OutlinedAI), PrivTIDAddr); - - CI->eraseFromParent(); - - for (Instruction *I : ToBeDeleted) - I->eraseFromParent(); - }; + if (Config.isTargetDevice()) { + // Generate OpenMP target specific runtime call + OI.PostOutlineCB = [=, ToBeDeletedVec = + std::move(ToBeDeleted)](Function &OutlinedFn) { + targetParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition, + NumThreads, PrivTID, PrivTIDAddr, ThreadID, + ToBeDeletedVec); + }; + } else { + // Generate OpenMP host runtime call + OI.PostOutlineCB = [=, ToBeDeletedVec = + std::move(ToBeDeleted)](Function &OutlinedFn) { + hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition, + PrivTID, PrivTIDAddr, ToBeDeletedVec); + }; + } // Adjust the finalization stack, verify the adjustment, and call the // finalize function a last time to finalize values between the pre-fini @@ -1367,7 +1511,7 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel( /* AllowVarArgs */ true, /* AllowAlloca */ true, /* AllocationBlock */ OuterAllocaBlock, - /* Suffix */ ".omp_par"); + /* Suffix */ ".omp_par", ArgsInZeroAddressSpace); // Find inputs to, outputs from the code region. 
BasicBlock *CommonExit = nullptr; diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index e50e74ea6c0d5aa..d6763fe79c24da2 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -274,6 +274,7 @@ struct OMPInformationCache : public InformationCache { : InformationCache(M, AG, Allocator, CGSCC), OMPBuilder(M), OpenMPPostLink(OpenMPPostLink) { + OMPBuilder.Config.IsTargetDevice = isOpenMPDevice(OMPBuilder.M); OMPBuilder.initialize(); initializeRuntimeFunctions(M); initializeInternalControlVars(); diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index 97cfc339675f657..b0793a034c07614 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -591,9 +591,124 @@ TEST_F(OpenMPIRBuilderTest, DbgLoc) { EXPECT_EQ(SrcSrc->getAsCString(), ";/src/test.dbg;foo;3;7;;"); } +TEST_F(OpenMPIRBuilderTest, ParallelSimpleGPU) { + using InsertPointTy = OpenMPIRBuilder::InsertPointTy; + std::string oldDLStr = M->getDataLayoutStr(); + M->setDataLayout( + "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:" + "256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:" + "256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"); + OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = true; + OMPBuilder.initialize(); + F->setName("func"); + IRBuilder<> Builder(BB); + BasicBlock *EnterBB = BasicBlock::Create(Ctx, "parallel.enter", F); + Builder.CreateBr(EnterBB); + Builder.SetInsertPoint(EnterBB); + OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL}); + Loc = OMPBuilder.createTargetInit(Loc, true); + + AllocaInst *PrivAI = nullptr; + + unsigned NumBodiesGenerated = 0; + unsigned NumPrivatizedVars = 0; + unsigned NumFinalizationPoints = 0; + + auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + 
++NumBodiesGenerated; + + Builder.restoreIP(AllocaIP); + PrivAI = Builder.CreateAlloca(F->arg_begin()->getType()); + Builder.CreateStore(F->arg_begin(), PrivAI); + + Builder.restoreIP(CodeGenIP); + Value *PrivLoad = + Builder.CreateLoad(PrivAI->getAllocatedType(), PrivAI, "local.use"); + Value *Cmp = Builder.CreateICmpNE(F->arg_begin(), PrivLoad); + Instruction *ThenTerm, *ElseTerm; + SplitBlockAndInsertIfThenElse(Cmp, CodeGenIP.getBlock()->getTerminator(), + &ThenTerm, &ElseTerm); + }; + + auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, + Value &Orig, Value &Inner, + Value *&ReplacementValue) -> InsertPointTy { + ++NumPrivatizedVars; + + if (!isa<AllocaInst>(Orig)) { + EXPECT_EQ(&Orig, F->arg_begin()); + ReplacementValue = &Inner; + return CodeGenIP; + } + + // Since the original value is an allocation, it has a pointer type and + // therefore no additional wrapping should happen. + EXPECT_EQ(&Orig, &Inner); + + // Trivial copy (=firstprivate). + Builder.restoreIP(AllocaIP); + Type *VTy = ReplacementValue->getType(); + Value *V = Builder.CreateLoad(VTy, &Inner, Orig.getName() + ".reload"); + ReplacementValue = Builder.CreateAlloca(VTy, 0, Orig.getName() + ".copy"); + Builder.restoreIP(CodeGenIP); + Builder.CreateStore(V, ReplacementValue); + return CodeGenIP; + }; + + auto FiniCB = [&](InsertPointTy CodeGenIP) { ++NumFinalizationPoints; }; + + IRBuilder<>::InsertPoint AllocaIP(&F->getEntryBlock(), + F->getEntryBlock().getFirstInsertionPt()); + IRBuilder<>::InsertPoint AfterIP = + OMPBuilder.createParallel(Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, + nullptr, nullptr, OMP_PROC_BIND_default, false); + + EXPECT_EQ(NumBodiesGenerated, 1U); + EXPECT_EQ(NumPrivatizedVars, 1U); + EXPECT_EQ(NumFinalizationPoints, 1U); + + Builder.restoreIP(AfterIP); + OMPBuilder.createTargetDeinit(Builder); + Builder.CreateRetVoid(); + + OMPBuilder.finalize(); + Function *OutlinedFn = PrivAI->getFunction(); + EXPECT_FALSE(verifyModule(*M, &errs())); + 
EXPECT_NE(OutlinedFn, F); + EXPECT_TRUE(OutlinedFn->hasFnAttribute(Attribute::NoUnwind)); + EXPECT_TRUE(OutlinedFn->hasParamAttribute(0, Attribute::NoAlias)); + EXPECT_TRUE(OutlinedFn->hasParamAttribute(1, Attribute::NoAlias)); + + EXPECT_TRUE(OutlinedFn->hasInternalLinkage()); + EXPECT_EQ(OutlinedFn->arg_size(), 3U); + // Make sure that arguments are pointers in 0 address address space + EXPECT_EQ(OutlinedFn->getArg(0)->getType(), + PointerType::get(M->getContext(), 0)); + EXPECT_EQ(OutlinedFn->getArg(1)->getType(), + PointerType::get(M->getContext(), 0)); + EXPECT_EQ(OutlinedFn->getArg(2)->getType(), + PointerType::get(M->getContext(), 0)); + EXPECT_EQ(&OutlinedFn->getEntryBlock(), PrivAI->getParent()); + EXPECT_EQ(OutlinedFn->getNumUses(), 1U); + User *Usr = OutlinedFn->user_back(); + ASSERT_TRUE(isa<CallInst>(Usr)); + CallInst *Parallel51CI = dyn_cast<CallInst>(Usr); + ASSERT_NE(Parallel51CI, nullptr); + + EXPECT_EQ(Parallel51CI->getCalledFunction()->getName(), "__kmpc_parallel_51"); + EXPECT_EQ(Parallel51CI->arg_size(), 9U); + EXPECT_EQ(Parallel51CI->getArgOperand(5), OutlinedFn); + EXPECT_TRUE( + isa<GlobalVariable>(Parallel51CI->getArgOperand(0)->stripPointerCasts())); + EXPECT_EQ(Parallel51CI, Usr); + M->setDataLayout(oldDLStr); +} + TEST_F(OpenMPIRBuilderTest, ParallelSimple) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); @@ -671,7 +786,6 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimple) { EXPECT_NE(F, OutlinedFn); EXPECT_FALSE(verifyModule(*M, &errs())); EXPECT_TRUE(OutlinedFn->hasFnAttribute(Attribute::NoUnwind)); - EXPECT_TRUE(OutlinedFn->hasFnAttribute(Attribute::NoRecurse)); EXPECT_TRUE(OutlinedFn->hasParamAttribute(0, Attribute::NoAlias)); EXPECT_TRUE(OutlinedFn->hasParamAttribute(1, Attribute::NoAlias)); @@ -699,6 +813,7 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimple) { TEST_F(OpenMPIRBuilderTest, 
ParallelNested) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); @@ -768,7 +883,6 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested) { continue; EXPECT_FALSE(verifyModule(*M, &errs())); EXPECT_TRUE(OutlinedFn.hasFnAttribute(Attribute::NoUnwind)); - EXPECT_TRUE(OutlinedFn.hasFnAttribute(Attribute::NoRecurse)); EXPECT_TRUE(OutlinedFn.hasParamAttribute(0, Attribute::NoAlias)); EXPECT_TRUE(OutlinedFn.hasParamAttribute(1, Attribute::NoAlias)); @@ -793,6 +907,7 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested) { TEST_F(OpenMPIRBuilderTest, ParallelNested2Inner) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); @@ -872,7 +987,6 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested2Inner) { continue; EXPECT_FALSE(verifyModule(*M, &errs())); EXPECT_TRUE(OutlinedFn.hasFnAttribute(Attribute::NoUnwind)); - EXPECT_TRUE(OutlinedFn.hasFnAttribute(Attribute::NoRecurse)); EXPECT_TRUE(OutlinedFn.hasParamAttribute(0, Attribute::NoAlias)); EXPECT_TRUE(OutlinedFn.hasParamAttribute(1, Attribute::NoAlias)); @@ -902,6 +1016,7 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested2Inner) { TEST_F(OpenMPIRBuilderTest, ParallelIfCond) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); @@ -1006,6 +1121,7 @@ TEST_F(OpenMPIRBuilderTest, ParallelIfCond) { TEST_F(OpenMPIRBuilderTest, ParallelCancelBarrier) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); @@ -1119,6 +1235,7 @@ TEST_F(OpenMPIRBuilderTest, 
ParallelCancelBarrier) { TEST_F(OpenMPIRBuilderTest, ParallelForwardAsPointers) { OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); @@ -4004,6 +4121,7 @@ TEST_F(OpenMPIRBuilderTest, OMPAtomicCompareCapture) { TEST_F(OpenMPIRBuilderTest, CreateTeams) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); @@ -4079,6 +4197,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTeams) { TEST_F(OpenMPIRBuilderTest, CreateTeamsWithThreadLimit) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> &Builder = OMPBuilder.Builder; @@ -4129,6 +4248,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithThreadLimit) { TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsUpper) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> &Builder = OMPBuilder.Builder; @@ -4180,6 +4300,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsUpper) { TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsBoth) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> &Builder = OMPBuilder.Builder; @@ -4234,6 +4355,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsBoth) { TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsAndThreadLimit) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> &Builder = OMPBuilder.Builder; @@ -4293,6 +4415,7 @@ 
TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsAndThreadLimit) { TEST_F(OpenMPIRBuilderTest, CreateTeamsWithIfCondition) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> &Builder = OMPBuilder.Builder; @@ -4351,6 +4474,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithIfCondition) { TEST_F(OpenMPIRBuilderTest, CreateTeamsWithIfConditionAndNumTeams) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> &Builder = OMPBuilder.Builder; @@ -4548,6 +4672,7 @@ xorAtomicReduction(OpenMPIRBuilder::InsertPointTy IP, Type *Ty, Value *LHS, TEST_F(OpenMPIRBuilderTest, CreateReductions) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); @@ -4780,6 +4905,7 @@ TEST_F(OpenMPIRBuilderTest, CreateReductions) { TEST_F(OpenMPIRBuilderTest, CreateTwoReductions) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); @@ -5796,6 +5922,7 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { TEST_F(OpenMPIRBuilderTest, CreateTask) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); @@ -5924,6 +6051,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTask) { TEST_F(OpenMPIRBuilderTest, CreateTaskNoArgs) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> 
Builder(BB); @@ -5954,6 +6082,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskNoArgs) { TEST_F(OpenMPIRBuilderTest, CreateTaskUntied) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); @@ -5983,6 +6112,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskUntied) { TEST_F(OpenMPIRBuilderTest, CreateTaskDepend) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); @@ -6056,6 +6186,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskDepend) { TEST_F(OpenMPIRBuilderTest, CreateTaskFinal) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); @@ -6109,6 +6240,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskFinal) { TEST_F(OpenMPIRBuilderTest, CreateTaskIfCondition) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); @@ -6269,6 +6401,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskgroup) { TEST_F(OpenMPIRBuilderTest, CreateTaskgroupWithTasks) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits