changeset 7d4d424c9f17 in /z/repo/gem5 details: http://repo.gem5.org/gem5?cmd=changeset;node=7d4d424c9f17 description: gpu-compute: support in-order data delivery in GM pipe
This patch adds an ordered response buffer to the GM pipeline to ensure in-order data delivery. The buffer is implemented as an STL ordered map, which sorts the requests in program order by using their sequence ID. When requests return to the GM pipeline they are marked as done. Only the oldest request may be serviced from the ordered buffer, and only if it is marked as done. The FIFO response buffers are kept and used in OoO delivery mode. diffstat: configs/example/apu_se.py | 8 +- src/arch/hsail/insts/decl.hh | 2 +- src/arch/hsail/insts/mem_impl.hh | 16 +- src/arch/hsail/insts/pseudo_inst.cc | 6 +- src/gpu-compute/GPU.py | 2 + src/gpu-compute/compute_unit.cc | 12 +-- src/gpu-compute/global_memory_pipeline.cc | 124 +++++++++++++++++++++++++---- src/gpu-compute/global_memory_pipeline.hh | 49 +++++++++++- 8 files changed, 173 insertions(+), 46 deletions(-) diffs (truncated from 430 to 300 lines): diff -r c7453f485a5f -r 7d4d424c9f17 configs/example/apu_se.py --- a/configs/example/apu_se.py Wed Oct 26 22:47:49 2016 -0400 +++ b/configs/example/apu_se.py Wed Oct 26 22:48:28 2016 -0400 @@ -153,7 +153,9 @@ help = 'fast forward using kvm until the m5_switchcpu' ' pseudo-op is encountered, then switch cpus. 
subsequent' ' m5_switchcpu pseudo-ops will toggle back and forth') - +parser.add_option('--outOfOrderDataDelivery', action='store_true', + default=False, help='enable OoO data delivery in the GM' + ' pipeline') Ruby.define_options(parser) @@ -248,7 +250,9 @@ localDataStore = \ LdsState(banks = options.numLdsBanks, bankConflictPenalty = \ - options.ldsBankConflictPenalty))) + options.ldsBankConflictPenalty), + out_of_order_data_delivery = + options.outOfOrderDataDelivery)) wavefronts = [] vrfs = [] for j in xrange(options.simds_per_cu): diff -r c7453f485a5f -r 7d4d424c9f17 src/arch/hsail/insts/decl.hh --- a/src/arch/hsail/insts/decl.hh Wed Oct 26 22:47:49 2016 -0400 +++ b/src/arch/hsail/insts/decl.hh Wed Oct 26 22:48:28 2016 -0400 @@ -1082,7 +1082,7 @@ gpuDynInst->useContinuation = false; GlobalMemPipeline* gmp = &(w->computeUnit->globalMemoryPipe); - gmp->getGMReqFIFO().push(gpuDynInst); + gmp->issueRequest(gpuDynInst); w->wrGmReqsInPipe--; w->rdGmReqsInPipe--; diff -r c7453f485a5f -r 7d4d424c9f17 src/arch/hsail/insts/mem_impl.hh --- a/src/arch/hsail/insts/mem_impl.hh Wed Oct 26 22:47:49 2016 -0400 +++ b/src/arch/hsail/insts/mem_impl.hh Wed Oct 26 22:48:28 2016 -0400 @@ -263,7 +263,7 @@ } } - w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->computeUnit->globalMemoryPipe.issueRequest(m); w->outstandingReqsRdGm++; w->rdGmReqsInPipe--; break; @@ -288,7 +288,7 @@ } } - w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->computeUnit->globalMemoryPipe.issueRequest(m); w->outstandingReqsRdGm++; w->rdGmReqsInPipe--; break; @@ -312,7 +312,7 @@ } } - w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->computeUnit->globalMemoryPipe.issueRequest(m); w->outstandingReqsRdGm++; w->rdGmReqsInPipe--; break; @@ -330,7 +330,7 @@ } } } - w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->computeUnit->globalMemoryPipe.issueRequest(m); w->outstandingReqsRdGm++; w->rdGmReqsInPipe--; break; @@ -440,7 +440,7 @@ } } - 
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->computeUnit->globalMemoryPipe.issueRequest(m); w->outstandingReqsWrGm++; w->wrGmReqsInPipe--; break; @@ -460,7 +460,7 @@ } } - w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->computeUnit->globalMemoryPipe.issueRequest(m); w->outstandingReqsWrGm++; w->wrGmReqsInPipe--; break; @@ -486,7 +486,7 @@ } } - w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->computeUnit->globalMemoryPipe.issueRequest(m); w->outstandingReqsWrGm++; w->wrGmReqsInPipe--; break; @@ -591,7 +591,7 @@ m->latency.set(w->computeUnit->shader->ticks(64)); m->pipeId = GLBMEM_PIPE; - w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->computeUnit->globalMemoryPipe.issueRequest(m); w->outstandingReqsWrGm++; w->wrGmReqsInPipe--; w->outstandingReqsRdGm++; diff -r c7453f485a5f -r 7d4d424c9f17 src/arch/hsail/insts/pseudo_inst.cc --- a/src/arch/hsail/insts/pseudo_inst.cc Wed Oct 26 22:47:49 2016 -0400 +++ b/src/arch/hsail/insts/pseudo_inst.cc Wed Oct 26 22:48:28 2016 -0400 @@ -648,7 +648,7 @@ m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(64)); - w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->computeUnit->globalMemoryPipe.issueRequest(m); w->outstandingReqsWrGm++; w->wrGmReqsInPipe--; w->outstandingReqsRdGm++; @@ -688,7 +688,7 @@ m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(64)); - w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->computeUnit->globalMemoryPipe.issueRequest(m); w->outstandingReqsWrGm++; w->wrGmReqsInPipe--; w->outstandingReqsRdGm++; @@ -727,7 +727,7 @@ m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); - w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->computeUnit->globalMemoryPipe.issueRequest(m); w->outstandingReqsRdGm++; w->rdGmReqsInPipe--; w->outstandingReqs++; diff -r c7453f485a5f -r 7d4d424c9f17 src/gpu-compute/GPU.py --- a/src/gpu-compute/GPU.py Wed Oct 26 22:47:49 2016 -0400 +++ 
b/src/gpu-compute/GPU.py Wed Oct 26 22:48:28 2016 -0400 @@ -135,6 +135,8 @@ vector_register_file = VectorParam.VectorRegisterFile("Vector register "\ "file") + out_of_order_data_delivery = Param.Bool(False, "enable OoO data delivery" + " in the GM pipeline") class Shader(ClockedObject): type = 'Shader' diff -r c7453f485a5f -r 7d4d424c9f17 src/gpu-compute/compute_unit.cc --- a/src/gpu-compute/compute_unit.cc Wed Oct 26 22:47:49 2016 -0400 +++ b/src/gpu-compute/compute_unit.cc Wed Oct 26 22:48:28 2016 -0400 @@ -1033,17 +1033,7 @@ if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) gpuDynInst->statusVector.clear(); - if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) { - assert(compute_unit->globalMemoryPipe.isGMLdRespFIFOWrRdy()); - - compute_unit->globalMemoryPipe.getGMLdRespFIFO() - .push(gpuDynInst); - } else { - assert(compute_unit->globalMemoryPipe.isGMStRespFIFOWrRdy()); - - compute_unit->globalMemoryPipe.getGMStRespFIFO() - .push(gpuDynInst); - } + compute_unit->globalMemoryPipe.handleResponse(gpuDynInst); DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n", compute_unit->cu_id, gpuDynInst->simdId, diff -r c7453f485a5f -r 7d4d424c9f17 src/gpu-compute/global_memory_pipeline.cc --- a/src/gpu-compute/global_memory_pipeline.cc Wed Oct 26 22:47:49 2016 -0400 +++ b/src/gpu-compute/global_memory_pipeline.cc Wed Oct 26 22:48:28 2016 -0400 @@ -45,7 +45,8 @@ GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) : computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size), - inflightStores(0), inflightLoads(0) + outOfOrderDataDelivery(p->out_of_order_data_delivery), inflightStores(0), + inflightLoads(0) { } @@ -61,8 +62,7 @@ GlobalMemPipeline::exec() { // apply any returned global memory operations - GPUDynInstPtr m = !gmReturnedLoads.empty() ? gmReturnedLoads.front() : - !gmReturnedStores.empty() ? 
gmReturnedStores.front() : nullptr; + GPUDynInstPtr m = getNextReadyResp(); bool accessVrf = true; Wavefront *w = nullptr; @@ -74,30 +74,19 @@ accessVrf = w->computeUnit->vrf[w->simdId]-> - vrfOperandAccessReady(m->seqNum(), w, m, - VrfAccessType::WRITE); + vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE); } - if ((!gmReturnedStores.empty() || !gmReturnedLoads.empty()) && - m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() && + if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() && accessVrf && m->statusBitVector == VectorMask(0) && (computeUnit->shader->coissue_return || - computeUnit->wfWait.at(m->pipeId).rdy())) { + computeUnit->wfWait.at(m->pipeId).rdy())) { w = m->wavefront(); m->completeAcc(m); - if (m->isLoad() || m->isAtomic()) { - gmReturnedLoads.pop(); - assert(inflightLoads > 0); - --inflightLoads; - } else { - assert(m->isStore()); - gmReturnedStores.pop(); - assert(inflightStores > 0); - --inflightStores; - } + completeRequest(m); // Decrement outstanding register count computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); @@ -129,15 +118,30 @@ } else { ++inflightLoads; } - } else { + } else if (mp->isStore()) { if (inflightStores >= gmQueueSize) { return; - } else if (mp->isStore()) { + } else { ++inflightStores; } } mp->initiateAcc(mp); + + if (!outOfOrderDataDelivery && !mp->isMemFence()) { + /** + * if we are not in out-of-order data delivery mode + * then we keep the responses sorted in program order. + * in order to do so we must reserve an entry in the + * resp buffer before we issue the request to the mem + * system. mem fence requests will not be stored here + * because once they are issued from the GM pipeline, + * they do not send any response back to it. 
+ */ + gmOrderedRespBuffer.insert(std::make_pair(mp->seqNum(), + std::make_pair(mp, false))); + } + gmIssuedRequests.pop(); DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n", @@ -145,6 +149,86 @@ } } +GPUDynInstPtr +GlobalMemPipeline::getNextReadyResp() +{ + if (outOfOrderDataDelivery) { + if (!gmReturnedLoads.empty()) { + return gmReturnedLoads.front(); + } else if (!gmReturnedStores.empty()) { + return gmReturnedStores.front(); + } + } else { + if (!gmOrderedRespBuffer.empty()) { + auto mem_req = gmOrderedRespBuffer.begin(); + + if (mem_req->second.second) { + return mem_req->second.first; + } + } + } + + return nullptr; +} + +void +GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst) +{ + if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) { + assert(inflightLoads > 0); + --inflightLoads; + } else if (gpuDynInst->isStore()) { _______________________________________________ gem5-dev mailing list gem5-dev@gem5.org http://m5sim.org/mailman/listinfo/gem5-dev