https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/85304
>From 04dc59ff7757dea18e2202d1cbff1d675885fdae Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <jeffrey.byr...@amd.com>
Date: Tue, 12 Mar 2024 10:22:24 -0700
Subject: [PATCH 1/2] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to
 support rules.

Change-Id: Id8460dc42f41575760793c0fc70e0bc0aecc0d5e
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   2 +-
 clang/lib/CodeGen/CGBuiltin.cpp               |  17 +++
 clang/test/CodeGenOpenCL/builtins-amdgcn.cl   |  14 +++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |  15 ++-
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp     | 112 ++++++++++++++++--
 llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp  |  14 +++
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  16 +++
 llvm/lib/Target/AMDGPU/SIPostRABundler.cpp    |   3 +-
 .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll |  25 ++++
 9 files changed, 202 insertions(+), 16 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 61ec8b79bf054d..f7b6a4610bd80a 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -63,7 +63,7 @@ BUILTIN(__builtin_amdgcn_s_sendmsghalt, "vIiUi", "n")
 BUILTIN(__builtin_amdgcn_s_barrier, "v", "n")
 BUILTIN(__builtin_amdgcn_wave_barrier, "v", "n")
 BUILTIN(__builtin_amdgcn_sched_barrier, "vIi", "n")
-BUILTIN(__builtin_amdgcn_sched_group_barrier, "vIiIiIi", "n")
+BUILTIN(__builtin_amdgcn_sched_group_barrier, "vIiIiIi.", "n")
 BUILTIN(__builtin_amdgcn_iglp_opt, "vIi", "n")
 BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n")
 BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 528a13fb275124..4bf71c7535db63 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18761,6 +18761,23 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_grid_size_z:
     return EmitAMDGPUGridSize(*this, 2);
 
+  // scheduling builtins
+  case AMDGPU::BI__builtin_amdgcn_sched_group_barrier: {
+    return E->getNumArgs() == 3
+               ? Builder.CreateCall(
+                     CGM.getIntrinsic(Intrinsic::amdgcn_sched_group_barrier),
+                     {EmitScalarExpr(E->getArg(0)),
+                      EmitScalarExpr(E->getArg(1)),
+                      EmitScalarExpr(E->getArg(2))})
+               : Builder.CreateCall(
+                     CGM.getIntrinsic(
+                         Intrinsic::amdgcn_sched_group_barrier_rule),
+                     {EmitScalarExpr(E->getArg(0)),
+                      EmitScalarExpr(E->getArg(1)),
+                      EmitScalarExpr(E->getArg(2)),
+                      EmitScalarExpr(E->getArg(3))});
+  }
+
   // r600 intrinsics
   case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
   case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 8a4533633706b2..e28e0a6987484b 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -436,6 +436,20 @@ void test_sched_group_barrier()
   __builtin_amdgcn_sched_group_barrier(15, 10000, -1);
 }
 
+// CHECK-LABEL: @test_sched_group_barrier_rule
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 0, i32 1, i32 2, i32 0)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 1, i32 2, i32 4, i32 0)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 4, i32 8, i32 16, i32 100)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 15, i32 10000, i32 -1, i32 -100)
+void test_sched_group_barrier_rule()
+{
+  __builtin_amdgcn_sched_group_barrier(0, 1, 2, 0);
+  __builtin_amdgcn_sched_group_barrier(1, 2, 4, 0);
+  __builtin_amdgcn_sched_group_barrier(4, 8, 16, 100);
+  __builtin_amdgcn_sched_group_barrier(15, 10000, -1, -100);
+}
+
+
 // CHECK-LABEL: @test_iglp_opt
 // CHECK: call void @llvm.amdgcn.iglp.opt(i32 0)
 // CHECK: call void @llvm.amdgcn.iglp.opt(i32 1)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 051e603c0819d2..68fe42a8f04d21 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -297,10 +297,17 @@ def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">,
 // matching instructions that will be associated with this sched_group_barrier.
 // The third parameter is an identifier which is used to describe what other
 // sched_group_barriers should be synchronized with.
-def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_barrier">,
-  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects,
-  IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+multiclass SCHED_GROUP_BARRIER_I {
+  def NAME: Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+    [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects,
+    IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+  def _rule: Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+    [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrNoMem, IntrHasSideEffects,
+    IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+}
+
+
+defm int_amdgcn_sched_group_barrier : SCHED_GROUP_BARRIER_I;
 
 // Scheduler optimization hint.
 // MASK = 0: Small gemm opt
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 57769fe998d1fe..d158659141f795 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -248,6 +248,7 @@ class SchedGroup {
 static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
   assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
          SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
+         SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER_RULE ||
          SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);
 
   while (!SU.Preds.empty())
@@ -399,7 +400,8 @@ void PipelineSolver::reset() {
       SmallVector<SUnit *, 32> TempCollection = SG.Collection;
       SG.Collection.clear();
       auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
-        return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
+        return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
+               SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER_RULE;
       });
       if (SchedBarr != TempCollection.end())
         SG.Collection.push_back(*SchedBarr);
@@ -457,7 +459,8 @@ void PipelineSolver::makePipeline() {
                         << " has: \n");
       SUnit *SGBarr = nullptr;
       for (auto &SU : SG.Collection) {
-        if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
+        if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
+            SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER_RULE)
           SGBarr = SU;
         LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n");
       }
@@ -496,7 +499,6 @@ int PipelineSolver::linkSUnit(
 int PipelineSolver::addEdges(
     SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
-
   // For IsBottomUp, the first SchedGroup in SyncPipeline contains the
   // instructions that are the ultimate successors in the resultant mutation.
   // Therefore, in such a configuration, the SchedGroups occurring before the
@@ -2337,6 +2339,12 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
 
   ScheduleDAGMI *DAG;
 
+  // An array of rule constructors. These are to be used by
+  // SCHED_GROUP_BARRIER_RULE with the RuleID argument being
+  // an index into this array.
+  std::vector<function_ref<std::shared_ptr<InstructionRule>(unsigned)>>
+      SchedGroupBarrierRuleCallBacks;
+
   // Organize lists of SchedGroups by their SyncID. SchedGroups /
   // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
   // between them.
@@ -2368,6 +2376,10 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
 public:
   void apply(ScheduleDAGInstrs *DAGInstrs) override;
 
+  // Define the rules to be used with sched_group_barrier rules and register
+  // the constructors.
+  void addSchedGroupBarrierRules();
+
   // The order in which the PipelineSolver should process the candidate
   // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last
   // created SchedGroup first, and will consider that as the ultimate
@@ -2379,7 +2391,9 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
   AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial;
 
   IGroupLPDAGMutation() = default;
-  IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {}
+  IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {
+    addSchedGroupBarrierRules();
+  }
 };
 
 unsigned SchedGroup::NumSchedGroups = 0;
@@ -2456,7 +2470,8 @@ int SchedGroup::link(SUnit &SU, bool MakePred,
   int MissedEdges = 0;
   for (auto *A : Collection) {
     SUnit *B = &SU;
-    if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
+    if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
+        A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER_RULE)
       continue;
     if (MakePred)
       std::swap(A, B);
@@ -2479,7 +2494,8 @@ int SchedGroup::link(SUnit &SU, bool MakePred,
 void SchedGroup::link(SUnit &SU, bool MakePred) {
   for (auto *A : Collection) {
     SUnit *B = &SU;
-    if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
+    if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
+        A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER_RULE)
       continue;
     if (MakePred)
       std::swap(A, B);
@@ -2578,7 +2594,8 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
       if (Opc == AMDGPU::SCHED_BARRIER) {
         addSchedBarrierEdges(*R);
         FoundSB = true;
-      } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
+      } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER ||
+                 Opc == AMDGPU::SCHED_GROUP_BARRIER_RULE) {
         initSchedGroupBarrierPipelineStage(R);
         FoundSB = true;
       } else if (Opc == AMDGPU::IGLP_OPT) {
@@ -2658,21 +2675,96 @@ IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
   return InvertedMask;
 }
 
+void IGroupLPDAGMutation::addSchedGroupBarrierRules() {
+  /// Whether or not the instruction has no true data predecessors
+  /// with opcode \p Opc.
+  class NoOpcDataPred : public InstructionRule {
+  protected:
+    unsigned Opc;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      return !std::any_of(
+          SU->Preds.begin(), SU->Preds.end(), [this](const SDep &Pred) {
+            return Pred.getKind() == SDep::Data &&
+                   Pred.getSUnit()->getInstr()->getOpcode() == Opc;
+          });
+    }
+
+    NoOpcDataPred(unsigned Opc, const SIInstrInfo *TII, unsigned SGID,
+                  bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Opc(Opc) {}
+  };
+
+  /// Whether or not the instruction has no write after read predecessors
+  /// with opcode \p Opc.
+  class NoOpcWARPred final : public InstructionRule {
+  protected:
+    unsigned Opc;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      return !std::any_of(
+          SU->Preds.begin(), SU->Preds.end(), [this](const SDep &Pred) {
+            return Pred.getKind() == SDep::Anti &&
+                   Pred.getSUnit()->getInstr()->getOpcode() == Opc;
+          });
+    }
+    NoOpcWARPred(unsigned Opc, const SIInstrInfo *TII, unsigned SGID,
+                 bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Opc(Opc){};
+  };
+
+  SchedGroupBarrierRuleCallBacks = {
+      [&](unsigned SGID) {
+        return std::make_shared<NoOpcWARPred>(AMDGPU::V_CNDMASK_B32_e64, TII,
+                                              SGID, false);
+      },
+      [&](unsigned SGID) {
+        return std::make_shared<NoOpcWARPred>(AMDGPU::V_PERM_B32_e64, TII, SGID,
+                                              false);
+      },
+      [&](unsigned SGID) {
+        return std::make_shared<NoOpcDataPred>(AMDGPU::V_CNDMASK_B32_e64, TII,
+                                               SGID, false);
+      },
+      [&](unsigned SGID) {
+        return std::make_shared<NoOpcDataPred>(AMDGPU::V_PERM_B32_e64, TII,
+                                               SGID, false);
+      }};
+}
+
 void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
     std::vector<SUnit>::reverse_iterator RIter) {
   // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
   // to the instruction having side effects.
   resetEdges(*RIter, DAG);
   MachineInstr &SGB = *RIter->getInstr();
-  assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
+  assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
+         SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER_RULE);
   int32_t SGMask = SGB.getOperand(0).getImm();
   int32_t Size = SGB.getOperand(1).getImm();
   int32_t SyncID = SGB.getOperand(2).getImm();
+  std::optional<int32_t> RuleID =
+      (SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER_RULE)
+          ? SGB.getOperand(3).getImm()
+          : std::optional<int32_t>(std::nullopt);
+
+  // Sanitize the input
+  if (RuleID && (!SchedGroupBarrierRuleCallBacks.size() ||
+                 *RuleID > (int)(SchedGroupBarrierRuleCallBacks.size() - 1))) {
+    RuleID = std::nullopt;
+    llvm_unreachable("Bad rule ID!");
+  }
 
-  auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
+  auto SG = &SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
                                                     Size, SyncID, DAG, TII);
+  if (RuleID)
+    SG->addRule(SchedGroupBarrierRuleCallBacks[*RuleID](SG->getSGID()));
 
-  SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
+  SG->initSchedGroup(RIter, SyncedInstrs[SG->getSyncID()]);
 }
 
 bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
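[For reference, an editorial summary that is not part of the patch: with this change a SCHED_GROUP_BARRIER_RULE carries one RuleID operand used as a direct index into SchedGroupBarrierRuleCallBacks. The enum below is illustrative C++ only, with invented names; it records the four rules in registration order. Patch 2 below re-encodes these IDs as bits of an i64 mask.]

    // Illustrative only: the RuleID values understood by patch 1, in the
    // order they are registered in SchedGroupBarrierRuleCallBacks above.
    enum IllustrativeRuleID {
      NoWARPredCndMask  = 0, // no anti/WAR predecessor with V_CNDMASK_B32_e64
      NoWARPredPerm     = 1, // no anti/WAR predecessor with V_PERM_B32_e64
      NoDataPredCndMask = 2, // no true data predecessor with V_CNDMASK_B32_e64
      NoDataPredPerm    = 3  // no true data predecessor with V_PERM_B32_e64
    };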
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index c24d39b9e5fddf..6b25e518de3e80 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -247,6 +247,20 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
     return;
   }
 
+  if (MI->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER_RULE) {
+    if (isVerbose()) {
+      std::string HexString;
+      raw_string_ostream HexStream(HexString);
+      HexStream << format_hex(MI->getOperand(0).getImm(), 10, true);
+      OutStreamer->emitRawComment(
+          " sched_group_barrier mask(" + HexString + ") size(" +
+          Twine(MI->getOperand(1).getImm()) + ") SyncID(" +
+          Twine(MI->getOperand(2).getImm()) + ")" + " Rule(" +
+          Twine(MI->getOperand(3).getImm()) + ")");
+    }
+    return;
+  }
+
   if (MI->getOpcode() == AMDGPU::IGLP_OPT) {
     if (isVerbose()) {
       std::string HexString;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 3ab788406ecb28..3602746a329115 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -402,6 +402,22 @@ def SCHED_GROUP_BARRIER : SPseudoInstSI<
   let isMeta = 1;
 }
 
+def SCHED_GROUP_BARRIER_RULE : SPseudoInstSI<
+  (outs),
+  (ins i32imm:$mask, i32imm:$size, i32imm:$syncid, i32imm:$ruleid),
+  [(int_amdgcn_sched_group_barrier_rule (i32 timm:$mask), (i32 timm:$size), (i32 timm:$syncid), (i32 timm:$ruleid))]> {
+  let SchedRW = [];
+  let hasNoSchedulingInfo = 1;
+  let hasSideEffects = 1;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let isConvergent = 1;
+  let FixedSize = 1;
+  let Size = 0;
+  let isMeta = 1;
+}
+
+
 def IGLP_OPT : SPseudoInstSI<(outs), (ins i32imm:$mask),
   [(int_amdgcn_iglp_opt (i32 timm:$mask))]> {
   let SchedRW = [];
diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index 8464cb3d6fc43d..f967ec6202e0dc 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -133,7 +133,8 @@ bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) {
   for (MachineBasicBlock &MBB : MF) {
     bool HasIGLPInstrs = llvm::any_of(MBB.instrs(), [](MachineInstr &MI) {
       unsigned Opc = MI.getOpcode();
-      return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT;
+      return Opc == AMDGPU::SCHED_GROUP_BARRIER ||
+             Opc == AMDGPU::SCHED_GROUP_BARRIER_RULE || Opc == AMDGPU::IGLP_OPT;
     });
 
     // Don't cluster with IGLP instructions.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index 10f09b6390abae..e64c7e612ad99a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -26,6 +26,30 @@ entry:
   ret void
 }
 
+define amdgpu_kernel void @test_sched_group_barrier_rule() #0 {
+; GCN-LABEL: test_sched_group_barrier_rule:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000000) size(1) SyncID(2) Rule(0)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000001) size(2) SyncID(4) Rule(1)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000004) size(8) SyncID(16) Rule(2)
+; GCN-NEXT:    ; sched_group_barrier mask(0x0000000F) size(10000) SyncID(-1) Rule(3)
+; GCN-NEXT:    s_endpgm
+;
+; EXACTCUTOFF-LABEL: test_sched_group_barrier_rule:
+; EXACTCUTOFF:       ; %bb.0: ; %entry
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000000) size(1) SyncID(2) Rule(0)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000001) size(2) SyncID(4) Rule(1)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000004) size(8) SyncID(16) Rule(2)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x0000000F) size(10000) SyncID(-1) Rule(3)
+; EXACTCUTOFF-NEXT:    s_endpgm
+entry:
+  call void @llvm.amdgcn.sched.group.barrier.rule(i32 0, i32 1, i32 2, i32 0) #1
+  call void @llvm.amdgcn.sched.group.barrier.rule(i32 1, i32 2, i32 4, i32 1) #1
+  call void @llvm.amdgcn.sched.group.barrier.rule(i32 4, i32 8, i32 16, i32 2) #1
+  call void @llvm.amdgcn.sched.group.barrier.rule(i32 15, i32 10000, i32 -1, i32 3) #1
+  ret void
+}
+
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE:
 ; GCN:       ; %bb.0:
@@ -1615,6 +1639,7 @@ entry:
 
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1
+declare void @llvm.amdgcn.sched.group.barrier.rule(i32, i32, i32, i32) #1
 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1
 declare float @llvm.exp.f32(float) #2
>From 4cd37bf60ff373461462d709725885ae92afd6f1 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <jeffrey.byr...@amd.com>
Date: Tue, 23 Apr 2024 12:02:52 -0700
Subject: [PATCH 2/2] variadic builtin arg -> intrinsic mask + add test

---
 clang/lib/CodeGen/CGBuiltin.cpp               |  35 ++--
 .../builtins-amdgcn-sched-param-err.cl        |  10 ++
 clang/test/CodeGenOpenCL/builtins-amdgcn.cl   |  14 +-
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |   2 +-
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp     |  41 +++--
 llvm/lib/Target/AMDGPU/SIInstructions.td      |   4 +-
 .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll |  10 +-
 .../llvm.amdgcn.sched.group.barrier.rule.ll   | 157 ++++++++++++++++++
 8 files changed, 231 insertions(+), 42 deletions(-)
 create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-sched-param-err.cl
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.rule.ll

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 4bf71c7535db63..88220df63266b5 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18763,19 +18763,28 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 
   // scheduling builtins
   case AMDGPU::BI__builtin_amdgcn_sched_group_barrier: {
-    return E->getNumArgs() == 3
-               ? Builder.CreateCall(
-                     CGM.getIntrinsic(Intrinsic::amdgcn_sched_group_barrier),
-                     {EmitScalarExpr(E->getArg(0)),
-                      EmitScalarExpr(E->getArg(1)),
-                      EmitScalarExpr(E->getArg(2))})
-               : Builder.CreateCall(
-                     CGM.getIntrinsic(
-                         Intrinsic::amdgcn_sched_group_barrier_rule),
-                     {EmitScalarExpr(E->getArg(0)),
-                      EmitScalarExpr(E->getArg(1)),
-                      EmitScalarExpr(E->getArg(2)),
-                      EmitScalarExpr(E->getArg(3))});
+    if (E->getNumArgs() == 3)
+      return Builder.CreateCall(
+          CGM.getIntrinsic(Intrinsic::amdgcn_sched_group_barrier),
+          {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)),
+           EmitScalarExpr(E->getArg(2))});
+
+    uint64_t Mask = 0;
+    for (unsigned I = 3; I < E->getNumArgs(); I++) {
+      auto NextArg = EmitScalarExpr(E->getArg(I));
+      auto ArgLiteral = cast<ConstantInt>(NextArg)->getZExtValue();
+      if (ArgLiteral > 63) {
+        CGM.Error(E->getExprLoc(),
+                  getContext().BuiltinInfo.getName(BuiltinID).str() +
+                      " RuleID must be within [0,63].");
+      }
+      Mask |= (uint64_t)1 << ArgLiteral;
+    }
+
+    return Builder.CreateCall(
+        CGM.getIntrinsic(Intrinsic::amdgcn_sched_group_barrier_rule),
+        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)),
+         EmitScalarExpr(E->getArg(2)), llvm::ConstantInt::get(Int64Ty, Mask)});
   }
 
   // r600 intrinsics
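[A minimal standalone sketch of the encoding the hunk above performs; the helper name is hypothetical, and the real code reads each trailing argument as a ConstantInt and reports CGM.Error on out-of-range values:]

    #include <cstdint>
    #include <optional>
    #include <vector>

    // Fold trailing RuleID arguments into the i64 mask operand of
    // llvm.amdgcn.sched.group.barrier.rule: RuleID n sets bit n.
    // Returns std::nullopt if any RuleID lies outside [0,63].
    std::optional<uint64_t> encodeRuleMask(const std::vector<uint64_t> &RuleIDs) {
      uint64_t Mask = 0;
      for (uint64_t ID : RuleIDs) {
        if (ID > 63)
          return std::nullopt; // clang diagnoses this case
        Mask |= uint64_t(1) << ID; // duplicates just set the same bit again
      }
      return Mask;
    }

For example, RuleIDs 0..7 yield 0xFF (the "i64 255" in the updated builtins-amdgcn.cl test below), a lone RuleID 63 sets bit 63 (printed as i64 -9223372036854775808), and a run of repeated zeros collapses to i64 1.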
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-sched-param-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-sched-param-err.cl
new file mode 100644
index 00000000000000..cd30074a729da4
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-sched-param-err.cl
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   -verify -S -o - %s
+
+void test_sched_group_barrier_rule()
+{
+  __builtin_amdgcn_sched_group_barrier(0, 1, 2, -1); // expected-error {{__builtin_amdgcn_sched_group_barrier RuleID must be within [0,63].}}
+  __builtin_amdgcn_sched_group_barrier(1, 2, 4, 64); // expected-error {{__builtin_amdgcn_sched_group_barrier RuleID must be within [0,63].}}
+  __builtin_amdgcn_sched_group_barrier(1, 2, 4, 101); // expected-error {{__builtin_amdgcn_sched_group_barrier RuleID must be within [0,63].}}
+  __builtin_amdgcn_sched_group_barrier(1, 2, 4, -2147483648); // expected-error {{__builtin_amdgcn_sched_group_barrier RuleID must be within [0,63].}}
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index e28e0a6987484b..73b9b6e1283e32 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -437,16 +437,18 @@ void test_sched_group_barrier()
 }
 
 // CHECK-LABEL: @test_sched_group_barrier_rule
-// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 0, i32 1, i32 2, i32 0)
-// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 1, i32 2, i32 4, i32 0)
-// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 4, i32 8, i32 16, i32 100)
-// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 15, i32 10000, i32 -1, i32 -100)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 0, i32 1, i32 2, i64 1)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 1, i32 2, i32 4, i64 1)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 1, i32 2, i32 4, i64 -9223372036854775808)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 2, i32 4, i32 6, i64 255)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 2, i32 4, i32 6, i64 1)
 void test_sched_group_barrier_rule()
 {
   __builtin_amdgcn_sched_group_barrier(0, 1, 2, 0);
   __builtin_amdgcn_sched_group_barrier(1, 2, 4, 0);
-  __builtin_amdgcn_sched_group_barrier(4, 8, 16, 100);
-  __builtin_amdgcn_sched_group_barrier(15, 10000, -1, -100);
+  __builtin_amdgcn_sched_group_barrier(1, 2, 4, 63);
+  __builtin_amdgcn_sched_group_barrier(2, 4, 6, 0, 1, 2, 3, 4, 5, 6, 7);
+  __builtin_amdgcn_sched_group_barrier(2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 68fe42a8f04d21..74e96ea1ff25ba 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -301,7 +301,7 @@ multiclass SCHED_GROUP_BARRIER_I {
   def NAME: Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
     [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects,
     IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
-  def _rule: Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+  def _rule: Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty],
     [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrNoMem, IntrHasSideEffects,
     IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index d158659141f795..96a43a5772d9c9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2342,7 +2342,8 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
   // An array of rule constructors. These are to be used by
   // SCHED_GROUP_BARRIER_RULE with the RuleID argument being
   // an index into this array.
-  std::vector<function_ref<std::shared_ptr<InstructionRule>(unsigned)>>
+  std::vector<function_ref<std::shared_ptr<InstructionRule>(
+      unsigned, const SIInstrInfo *)>>
       SchedGroupBarrierRuleCallBacks;
 
   // Organize lists of SchedGroups by their SyncID. SchedGroups /
@@ -2677,6 +2678,7 @@ IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
 }
 
 void IGroupLPDAGMutation::addSchedGroupBarrierRules() {
+
   /// Whether or not the instruction has no true data predecessors
   /// with opcode \p Opc.
   class NoOpcDataPred : public InstructionRule {
@@ -2718,19 +2720,19 @@ void IGroupLPDAGMutation::addSchedGroupBarrierRules() {
   };
 
   SchedGroupBarrierRuleCallBacks = {
-      [&](unsigned SGID) {
+      [](unsigned SGID, const SIInstrInfo *TII) {
         return std::make_shared<NoOpcWARPred>(AMDGPU::V_CNDMASK_B32_e64, TII,
                                               SGID, false);
       },
-      [&](unsigned SGID) {
+      [](unsigned SGID, const SIInstrInfo *TII) {
         return std::make_shared<NoOpcWARPred>(AMDGPU::V_PERM_B32_e64, TII, SGID,
                                               false);
       },
-      [&](unsigned SGID) {
+      [](unsigned SGID, const SIInstrInfo *TII) {
        return std::make_shared<NoOpcDataPred>(AMDGPU::V_CNDMASK_B32_e64, TII,
                                                SGID, false);
       },
-      [&](unsigned SGID) {
+      [](unsigned SGID, const SIInstrInfo *TII) {
         return std::make_shared<NoOpcDataPred>(AMDGPU::V_PERM_B32_e64, TII,
                                                SGID, false);
       }};
@@ -2747,23 +2749,32 @@ void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
   int32_t SGMask = SGB.getOperand(0).getImm();
   int32_t Size = SGB.getOperand(1).getImm();
   int32_t SyncID = SGB.getOperand(2).getImm();
-  std::optional<int32_t> RuleID =
+  std::optional<uint64_t> RuleMask =
       (SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER_RULE)
           ? SGB.getOperand(3).getImm()
-          : std::optional<int32_t>(std::nullopt);
-
-  // Sanitize the input
-  if (RuleID && (!SchedGroupBarrierRuleCallBacks.size() ||
-                 *RuleID > (int)(SchedGroupBarrierRuleCallBacks.size() - 1))) {
-    RuleID = std::nullopt;
-    llvm_unreachable("Bad rule ID!");
-  }
+          : std::optional<uint64_t>(std::nullopt);
 
   auto SG = &SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
                                                     Size, SyncID, DAG, TII);
-  if (RuleID)
-    SG->addRule(SchedGroupBarrierRuleCallBacks[*RuleID](SG->getSGID()));
+  // Process the input mask
+  if (RuleMask) {
+    uint64_t TheMask = *RuleMask;
+    unsigned NextID = 0;
+    while (TheMask) {
+      if (!(TheMask & 0x1)) {
+        TheMask >>= 1;
+        ++NextID;
+        continue;
+      }
+      if ((!SchedGroupBarrierRuleCallBacks.size() ||
+           NextID > SchedGroupBarrierRuleCallBacks.size() - 1))
+        llvm_unreachable("Bad rule ID!");
+      SG->addRule(SchedGroupBarrierRuleCallBacks[NextID](SG->getSGID(), TII));
+      TheMask >>= 1;
+      ++NextID;
+    }
+  }
 
   SG->initSchedGroup(RIter, SyncedInstrs[SG->getSyncID()]);
 }
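[To make the decoding above concrete, a worked example written as illustrative C++ comments, mirroring the while-loop just added; the new rule.ll test below passes i64 2 as the rule-mask operand:]

    // Worked example (illustrative): RuleMask = 2 = 0b10.
    //   iteration 1: bit 0 clear -> shift, NextID = 1;
    //   iteration 2: bit 1 set   -> SG->addRule(
    //                                 SchedGroupBarrierRuleCallBacks[1](SGID, TII));
    //   mask is now 0 -> loop exits.
    // Note that AMDGPUMCInstLower prints the raw operand, so the test's
    // "Rule(2)" comment refers to the mask value 2, i.e. rule index 1.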
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 3602746a329115..1c6787e34704e9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -404,8 +404,8 @@ def SCHED_GROUP_BARRIER : SPseudoInstSI<
 
 def SCHED_GROUP_BARRIER_RULE : SPseudoInstSI<
   (outs),
-  (ins i32imm:$mask, i32imm:$size, i32imm:$syncid, i32imm:$ruleid),
-  [(int_amdgcn_sched_group_barrier_rule (i32 timm:$mask), (i32 timm:$size), (i32 timm:$syncid), (i32 timm:$ruleid))]> {
+  (ins i32imm:$mask, i32imm:$size, i32imm:$syncid, i64imm:$rulemask),
+  [(int_amdgcn_sched_group_barrier_rule (i32 timm:$mask), (i32 timm:$size), (i32 timm:$syncid), (i64 timm:$rulemask))]> {
   let SchedRW = [];
   let hasNoSchedulingInfo = 1;
   let hasSideEffects = 1;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index e64c7e612ad99a..820bb63075e46b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -43,10 +43,10 @@ define amdgpu_kernel void @test_sched_group_barrier_rule() #0 {
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x0000000F) size(10000) SyncID(-1) Rule(3)
 ; EXACTCUTOFF-NEXT:    s_endpgm
 entry:
-  call void @llvm.amdgcn.sched.group.barrier.rule(i32 0, i32 1, i32 2, i32 0) #1
-  call void @llvm.amdgcn.sched.group.barrier.rule(i32 1, i32 2, i32 4, i32 1) #1
-  call void @llvm.amdgcn.sched.group.barrier.rule(i32 4, i32 8, i32 16, i32 2) #1
-  call void @llvm.amdgcn.sched.group.barrier.rule(i32 15, i32 10000, i32 -1, i32 3) #1
+  call void @llvm.amdgcn.sched.group.barrier.rule(i32 0, i32 1, i32 2, i64 0) #1
+  call void @llvm.amdgcn.sched.group.barrier.rule(i32 1, i32 2, i32 4, i64 1) #1
+  call void @llvm.amdgcn.sched.group.barrier.rule(i32 4, i32 8, i32 16, i64 2) #1
+  call void @llvm.amdgcn.sched.group.barrier.rule(i32 15, i32 10000, i32 -1, i64 3) #1
   ret void
 }
 
@@ -1639,7 +1639,7 @@ entry:
 
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1
-declare void @llvm.amdgcn.sched.group.barrier.rule(i32, i32, i32, i32) #1
+declare void @llvm.amdgcn.sched.group.barrier.rule(i32, i32, i32, i64) #1
 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1
 declare float @llvm.exp.f32(float) #2
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.rule.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.rule.ll
new file mode 100644
index 00000000000000..0de23d3e69e51c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.rule.ll
@@ -0,0 +1,157 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s
+
+define amdgpu_kernel void @rule2(ptr addrspace(7) noalias %in, ptr addrspace(3) noalias %out, i32 %idx0, i32 %idx1, i32 %idx2, i32 %idx3, i32 %val) #0 {
+; GCN-LABEL: rule2:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s12, s[0:1], 0x34
+; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x48
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mad_u32_u24 v2, v0, 24, s12
+; GCN-NEXT:    v_mad_u64_u32 v[10:11], s[2:3], s8, 24, v[2:3]
+; GCN-NEXT:    v_mad_u64_u32 v[12:13], s[2:3], s9, 24, v[2:3]
+; GCN-NEXT:    v_mad_u64_u32 v[14:15], s[2:3], s10, 24, v[2:3]
+; GCN-NEXT:    v_mad_u64_u32 v[16:17], s[2:3], s11, 24, v[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[6:7], v10, s[4:7], 0 offen
+; GCN-NEXT:    buffer_load_dwordx2 v[8:9], v12, s[4:7], 0 offen
+; GCN-NEXT:    buffer_load_dwordx2 v[2:3], v14, s[4:7], 0 offen
+; GCN-NEXT:    buffer_load_dwordx2 v[4:5], v16, s[4:7], 0 offen
+; GCN-NEXT:    s_load_dword s13, s[0:1], 0x44
+; GCN-NEXT:    s_load_dword s2, s[0:1], 0x58
+; GCN-NEXT:    s_lshl_b32 s0, s8, 5
+; GCN-NEXT:    s_lshl_b32 s1, s8, 2
+; GCN-NEXT:    s_lshl_b32 s3, s9, 5
+; GCN-NEXT:    s_lshl_b32 s8, s9, 2
+; GCN-NEXT:    s_lshl_b32 s9, s10, 5
+; GCN-NEXT:    s_lshl_b32 s10, s10, 2
+; GCN-NEXT:    s_lshl_b32 s14, s11, 5
+; GCN-NEXT:    s_lshl_b32 s11, s11, 2
+; GCN-NEXT:    s_add_i32 s0, s0, s12
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_add_i32 s1, s1, s13
+; GCN-NEXT:    s_add_i32 s3, s3, s12
+; GCN-NEXT:    s_add_i32 s8, s8, s13
+; GCN-NEXT:    s_add_i32 s9, s9, s12
+; GCN-NEXT:    s_add_i32 s10, s10, s13
+; GCN-NEXT:    s_add_i32 s12, s14, s12
+; GCN-NEXT:    s_add_i32 s11, s11, s13
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GCN-NEXT:    s_add_i32 s0, s0, 32
+; GCN-NEXT:    s_add_i32 s1, s1, 4
+; GCN-NEXT:    s_add_i32 s3, s3, 32
+; GCN-NEXT:    s_add_i32 s8, s8, 4
+; GCN-NEXT:    s_add_i32 s9, s9, 32
+; GCN-NEXT:    s_add_i32 s10, s10, 4
+; GCN-NEXT:    s_add_i32 s12, s12, 32
+; GCN-NEXT:    s_add_i32 s11, s11, 4
+; GCN-NEXT:    s_mov_b32 s13, 0x5040100
+; GCN-NEXT:    s_mov_b32 s14, 0x7060302
+; GCN-NEXT:  .LBB0_1: ; %bb.1
+; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_perm_b32 v11, v9, v8, s13
+; GCN-NEXT:    v_perm_b32 v10, v7, v6, s13
+; GCN-NEXT:    v_perm_b32 v9, v9, v8, s14
+; GCN-NEXT:    v_perm_b32 v8, v7, v6, s14
+; GCN-NEXT:    v_add_u32_e32 v1, s1, v0
+; GCN-NEXT:    v_add_u32_e32 v6, s8, v0
+; GCN-NEXT:    v_add_u32_e32 v7, s10, v0
+; GCN-NEXT:    v_add_u32_e32 v12, s11, v0
+; GCN-NEXT:    ds_write_b64 v1, v[10:11]
+; GCN-NEXT:    ds_write_b64 v6, v[8:9]
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    ds_write_b64 v7, v[2:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ds_write_b64 v12, v[4:5]
+; GCN-NEXT:    v_add_u32_e32 v1, s0, v0
+; GCN-NEXT:    v_add_u32_e32 v10, s3, v0
+; GCN-NEXT:    v_add_u32_e32 v11, s9, v0
+; GCN-NEXT:    v_add_u32_e32 v12, s12, v0
+; GCN-NEXT:    buffer_load_dwordx2 v[2:3], v11, s[4:7], 0 offen
+; GCN-NEXT:    buffer_load_dwordx2 v[4:5], v12, s[4:7], 0 offen
+; GCN-NEXT:    buffer_load_dwordx2 v[6:7], v1, s[4:7], 0 offen
+; GCN-NEXT:    buffer_load_dwordx2 v[8:9], v10, s[4:7], 0 offen
+; GCN-NEXT:    s_add_i32 s2, s2, 1
+; GCN-NEXT:    s_add_i32 s0, s0, 32
+; GCN-NEXT:    s_add_i32 s1, s1, 4
+; GCN-NEXT:    s_add_i32 s3, s3, 32
+; GCN-NEXT:    s_add_i32 s8, s8, 4
+; GCN-NEXT:    s_add_i32 s9, s9, 32
+; GCN-NEXT:    s_add_i32 s10, s10, 4
+; GCN-NEXT:    s_add_i32 s12, s12, 32
+; GCN-NEXT:    s_add_i32 s11, s11, 4
+; GCN-NEXT:    s_cmp_lt_u32 s2, 15
+; GCN-NEXT:    ; kill: killed $vgpr12
+; GCN-NEXT:    ; kill: killed $vgpr10
+; GCN-NEXT:    ; kill: killed $vgpr11
+; GCN-NEXT:    ; kill: killed $vgpr1
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(1) Rule(2)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(1) Rule(2)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(1)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(1)
+; GCN-NEXT:    s_cbranch_scc1 .LBB0_1
+; GCN-NEXT:  ; %bb.2: ; %bb.2
+; GCN-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() #2
+  %gepPtr = getelementptr ptr addrspace(7), ptr addrspace(7) %in, i32 %tid
+  %outPtr = getelementptr ptr addrspace(7), ptr addrspace(3) %out, i32 %tid
+  %gep0 = getelementptr ptr addrspace(7), ptr addrspace(7) %gepPtr, i32 %idx0
+  %gep1 = getelementptr ptr addrspace(7), ptr addrspace(7) %gepPtr, i32 %idx1
+  %gep2 = getelementptr ptr addrspace(7), ptr addrspace(7) %gepPtr, i32 %idx2
+  %gep3 = getelementptr ptr addrspace(7), ptr addrspace(7) %gepPtr, i32 %idx3
+  %load0 = load <4 x i16>, ptr addrspace(7) %gep0
+  %load1 = load <4 x i16>, ptr addrspace(7) %gep1
+  %load2 = load <4 x i16>, ptr addrspace(7) %gep2
+  %load3 = load <4 x i16>, ptr addrspace(7) %gep3
+  br label %bb.1
+
+bb.1:
+  %p0 = phi <4 x i16> [ %load0, %entry ], [ %load4, %bb.1 ]
+  %p1 = phi <4 x i16> [ %load1, %entry ], [ %load5, %bb.1 ]
+  %p2 = phi <4 x i16> [ %load2, %entry ], [ %load6, %bb.1 ]
+  %p3 = phi <4 x i16> [ %load3, %entry ], [ %load7, %bb.1 ]
+  %val1 = phi i32 [%val, %entry], [%val2, %bb.1]
+  %idx8 = phi i32 [%idx0, %entry], [%idx4, %bb.1]
+  %idx9 = phi i32 [%idx1, %entry], [%idx5, %bb.1]
+  %idx10 = phi i32 [%idx2, %entry], [%idx6, %bb.1]
+  %idx11 = phi i32 [%idx3, %entry], [%idx7, %bb.1]
+  %shuffle1 = shufflevector <4 x i16> %p0, <4 x i16> %p1, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %shuffle2 = shufflevector <4 x i16> %p0, <4 x i16> %p1, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %idx4 = add i32 %idx8, 1
+  %idx5 = add i32 %idx9, 1
+  %idx6 = add i32 %idx10, 1
+  %idx7 = add i32 %idx11, 1
+  %out0 = getelementptr ptr addrspace(3), ptr addrspace(3) %outPtr, i32 %idx4
+  %out1 = getelementptr ptr addrspace(3), ptr addrspace(3) %outPtr, i32 %idx5
+  %out2 = getelementptr ptr addrspace(3), ptr addrspace(3) %outPtr, i32 %idx6
+  %out3 = getelementptr ptr addrspace(3), ptr addrspace(3) %outPtr, i32 %idx7
+  store <4 x i16> %shuffle1, ptr addrspace(3) %out0
+  store <4 x i16> %shuffle2, ptr addrspace(3) %out1
+  store <4 x i16> %p2, ptr addrspace(3) %out2
+  store <4 x i16> %p3, ptr addrspace(3) %out3
+  %gep4 = getelementptr ptr addrspace(7), ptr addrspace(7) %gepPtr, i32 %idx4
+  %gep5 = getelementptr ptr addrspace(7), ptr addrspace(7) %gepPtr, i32 %idx5
+  %gep6 = getelementptr ptr addrspace(7), ptr addrspace(7) %gepPtr, i32 %idx6
+  %gep7 = getelementptr ptr addrspace(7), ptr addrspace(7) %gepPtr, i32 %idx7
+  %load4 = load <4 x i16>, ptr addrspace(7) %gep4
+  %load5 = load <4 x i16>, ptr addrspace(7) %gep5
+  %load6 = load <4 x i16>, ptr addrspace(7) %gep6
+  %load7 = load <4 x i16>, ptr addrspace(7) %gep7
+  call void @llvm.amdgcn.sched.group.barrier.rule(i32 32, i32 1, i32 1, i64 2)
+  call void @llvm.amdgcn.sched.group.barrier.rule(i32 32, i32 1, i32 1, i64 2)
+  call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 1)
+  call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 1)
+  %val2 = add i32 %val1, 1
+  %cmp = icmp ult i32 %val2, 15
+  br i1 %cmp, label %bb.1, label %bb.2
+
+bb.2:
+  ret void
+}
+
+declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1
+declare void @llvm.amdgcn.sched.group.barrier.rule(i32, i32, i32, i64) #1
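[Putting the two patches together, source-level usage looks like the sketch below, written in the style of the patch's own tests; the kernel body is illustrative, not part of the patch. The first three arguments keep the existing sched_group_barrier meaning (group mask, size, SyncID), and any further arguments are RuleIDs in [0,63] that clang folds into the i64 rule-mask operand:]

    // Illustrative usage of the extended builtin (not from the patch).
    void pipeline_hints() {
      // Unchanged three-argument form: lowers to
      // llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 1).
      __builtin_amdgcn_sched_group_barrier(32, 1, 1);
      // Rule form: the same group, additionally restricted by rule 2
      // ("no true data predecessor with opcode V_CNDMASK_B32_e64").
      // RuleID 2 becomes mask bit 2, so this lowers to
      // llvm.amdgcn.sched.group.barrier.rule(i32 32, i32 1, i32 1, i64 4).
      __builtin_amdgcn_sched_group_barrier(32, 1, 1, 2);
    }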