changeset 7d4d424c9f17 in /z/repo/gem5
details: http://repo.gem5.org/gem5?cmd=changeset;node=7d4d424c9f17
description:
        gpu-compute: support in-order data delivery in GM pipe

        this patch adds an ordered response buffer to the GM pipeline
        to ensure in-order data delivery. the buffer is implemented as
        an STL ordered map, which sorts the requests in program order by
        using their sequence IDs. when requests return to the GM pipeline
        they are marked as done. only the oldest request may be serviced
        from the ordered buffer, and only if it is marked as done.

        the FIFO response buffers are kept and used in OoO delivery mode

diffstat:

 configs/example/apu_se.py                 |    8 +-
 src/arch/hsail/insts/decl.hh              |    2 +-
 src/arch/hsail/insts/mem_impl.hh          |   16 +-
 src/arch/hsail/insts/pseudo_inst.cc       |    6 +-
 src/gpu-compute/GPU.py                    |    2 +
 src/gpu-compute/compute_unit.cc           |   12 +--
 src/gpu-compute/global_memory_pipeline.cc |  124 +++++++++++++++++++++++++----
 src/gpu-compute/global_memory_pipeline.hh |   49 +++++++++++-
 8 files changed, 173 insertions(+), 46 deletions(-)

diffs (truncated from 430 to 300 lines):

diff -r c7453f485a5f -r 7d4d424c9f17 configs/example/apu_se.py
--- a/configs/example/apu_se.py Wed Oct 26 22:47:49 2016 -0400
+++ b/configs/example/apu_se.py Wed Oct 26 22:48:28 2016 -0400
@@ -153,7 +153,9 @@
                   help = 'fast forward using kvm until the m5_switchcpu'
                   ' pseudo-op is encountered, then switch cpus. subsequent'
                   ' m5_switchcpu pseudo-ops will toggle back and forth')
-
+parser.add_option('--outOfOrderDataDelivery', action='store_true',
+                  default=False, help='enable OoO data delivery in the GM'
+                  ' pipeline')
 
 Ruby.define_options(parser)
 
@@ -248,7 +250,9 @@
                                      localDataStore = \
                                      LdsState(banks = options.numLdsBanks,
                                               bankConflictPenalty = \
-                                              options.ldsBankConflictPenalty)))
+                                              options.ldsBankConflictPenalty),
+                                     out_of_order_data_delivery =
+                                             options.outOfOrderDataDelivery))
     wavefronts = []
     vrfs = []
     for j in xrange(options.simds_per_cu):
diff -r c7453f485a5f -r 7d4d424c9f17 src/arch/hsail/insts/decl.hh
--- a/src/arch/hsail/insts/decl.hh      Wed Oct 26 22:47:49 2016 -0400
+++ b/src/arch/hsail/insts/decl.hh      Wed Oct 26 22:48:28 2016 -0400
@@ -1082,7 +1082,7 @@
 
                 gpuDynInst->useContinuation = false;
                 GlobalMemPipeline* gmp = &(w->computeUnit->globalMemoryPipe);
-                gmp->getGMReqFIFO().push(gpuDynInst);
+                gmp->issueRequest(gpuDynInst);
 
                 w->wrGmReqsInPipe--;
                 w->rdGmReqsInPipe--;
diff -r c7453f485a5f -r 7d4d424c9f17 src/arch/hsail/insts/mem_impl.hh
--- a/src/arch/hsail/insts/mem_impl.hh  Wed Oct 26 22:47:49 2016 -0400
+++ b/src/arch/hsail/insts/mem_impl.hh  Wed Oct 26 22:48:28 2016 -0400
@@ -263,7 +263,7 @@
                 }
             }
 
-            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->computeUnit->globalMemoryPipe.issueRequest(m);
             w->outstandingReqsRdGm++;
             w->rdGmReqsInPipe--;
             break;
@@ -288,7 +288,7 @@
                 }
             }
 
-            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->computeUnit->globalMemoryPipe.issueRequest(m);
             w->outstandingReqsRdGm++;
             w->rdGmReqsInPipe--;
             break;
@@ -312,7 +312,7 @@
                 }
             }
 
-            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->computeUnit->globalMemoryPipe.issueRequest(m);
             w->outstandingReqsRdGm++;
             w->rdGmReqsInPipe--;
             break;
@@ -330,7 +330,7 @@
                     }
                 }
             }
-            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->computeUnit->globalMemoryPipe.issueRequest(m);
             w->outstandingReqsRdGm++;
             w->rdGmReqsInPipe--;
             break;
@@ -440,7 +440,7 @@
                 }
             }
 
-            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->computeUnit->globalMemoryPipe.issueRequest(m);
             w->outstandingReqsWrGm++;
             w->wrGmReqsInPipe--;
             break;
@@ -460,7 +460,7 @@
                 }
             }
 
-            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->computeUnit->globalMemoryPipe.issueRequest(m);
             w->outstandingReqsWrGm++;
             w->wrGmReqsInPipe--;
             break;
@@ -486,7 +486,7 @@
                 }
             }
 
-            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->computeUnit->globalMemoryPipe.issueRequest(m);
             w->outstandingReqsWrGm++;
             w->wrGmReqsInPipe--;
             break;
@@ -591,7 +591,7 @@
             m->latency.set(w->computeUnit->shader->ticks(64));
             m->pipeId = GLBMEM_PIPE;
 
-            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->computeUnit->globalMemoryPipe.issueRequest(m);
             w->outstandingReqsWrGm++;
             w->wrGmReqsInPipe--;
             w->outstandingReqsRdGm++;
diff -r c7453f485a5f -r 7d4d424c9f17 src/arch/hsail/insts/pseudo_inst.cc
--- a/src/arch/hsail/insts/pseudo_inst.cc       Wed Oct 26 22:47:49 2016 -0400
+++ b/src/arch/hsail/insts/pseudo_inst.cc       Wed Oct 26 22:48:28 2016 -0400
@@ -648,7 +648,7 @@
 
         m->pipeId = GLBMEM_PIPE;
         m->latency.set(w->computeUnit->shader->ticks(64));
-        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+        w->computeUnit->globalMemoryPipe.issueRequest(m);
         w->outstandingReqsWrGm++;
         w->wrGmReqsInPipe--;
         w->outstandingReqsRdGm++;
@@ -688,7 +688,7 @@
 
         m->pipeId = GLBMEM_PIPE;
         m->latency.set(w->computeUnit->shader->ticks(64));
-        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+        w->computeUnit->globalMemoryPipe.issueRequest(m);
         w->outstandingReqsWrGm++;
         w->wrGmReqsInPipe--;
         w->outstandingReqsRdGm++;
@@ -727,7 +727,7 @@
 
         m->pipeId = GLBMEM_PIPE;
         m->latency.set(w->computeUnit->shader->ticks(1));
-        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+        w->computeUnit->globalMemoryPipe.issueRequest(m);
         w->outstandingReqsRdGm++;
         w->rdGmReqsInPipe--;
         w->outstandingReqs++;
diff -r c7453f485a5f -r 7d4d424c9f17 src/gpu-compute/GPU.py
--- a/src/gpu-compute/GPU.py    Wed Oct 26 22:47:49 2016 -0400
+++ b/src/gpu-compute/GPU.py    Wed Oct 26 22:48:28 2016 -0400
@@ -135,6 +135,8 @@
 
     vector_register_file = VectorParam.VectorRegisterFile("Vector register "\
                                                           "file")
+    out_of_order_data_delivery = Param.Bool(False, "enable OoO data delivery"
+                                            " in the GM pipeline")
 
 class Shader(ClockedObject):
     type = 'Shader'
diff -r c7453f485a5f -r 7d4d424c9f17 src/gpu-compute/compute_unit.cc
--- a/src/gpu-compute/compute_unit.cc   Wed Oct 26 22:47:49 2016 -0400
+++ b/src/gpu-compute/compute_unit.cc   Wed Oct 26 22:48:28 2016 -0400
@@ -1033,17 +1033,7 @@
                 if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
                     gpuDynInst->statusVector.clear();
 
-                if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
-                    assert(compute_unit->globalMemoryPipe.isGMLdRespFIFOWrRdy());
-
-                    compute_unit->globalMemoryPipe.getGMLdRespFIFO()
-                        .push(gpuDynInst);
-                } else {
-                    assert(compute_unit->globalMemoryPipe.isGMStRespFIFOWrRdy());
-
-                    compute_unit->globalMemoryPipe.getGMStRespFIFO()
-                        .push(gpuDynInst);
-                }
+                compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
 
                 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
                         compute_unit->cu_id, gpuDynInst->simdId,
diff -r c7453f485a5f -r 7d4d424c9f17 src/gpu-compute/global_memory_pipeline.cc
--- a/src/gpu-compute/global_memory_pipeline.cc Wed Oct 26 22:47:49 2016 -0400
+++ b/src/gpu-compute/global_memory_pipeline.cc Wed Oct 26 22:48:28 2016 -0400
@@ -45,7 +45,8 @@
 
 GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
     computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
-    inflightStores(0), inflightLoads(0)
+    outOfOrderDataDelivery(p->out_of_order_data_delivery), inflightStores(0),
+    inflightLoads(0)
 {
 }
 
@@ -61,8 +62,7 @@
 GlobalMemPipeline::exec()
 {
     // apply any returned global memory operations
-    GPUDynInstPtr m = !gmReturnedLoads.empty() ? gmReturnedLoads.front() :
-        !gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr;
+    GPUDynInstPtr m = getNextReadyResp();
 
     bool accessVrf = true;
     Wavefront *w = nullptr;
@@ -74,30 +74,19 @@
 
         accessVrf =
             w->computeUnit->vrf[w->simdId]->
-            vrfOperandAccessReady(m->seqNum(), w, m,
-                                  VrfAccessType::WRITE);
+                vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE);
     }
 
-    if ((!gmReturnedStores.empty() || !gmReturnedLoads.empty()) &&
-        m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
+    if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
         accessVrf && m->statusBitVector == VectorMask(0) &&
         (computeUnit->shader->coissue_return ||
-         computeUnit->wfWait.at(m->pipeId).rdy())) {
+        computeUnit->wfWait.at(m->pipeId).rdy())) {
 
         w = m->wavefront();
 
         m->completeAcc(m);
 
-        if (m->isLoad() || m->isAtomic()) {
-            gmReturnedLoads.pop();
-            assert(inflightLoads > 0);
-            --inflightLoads;
-        } else {
-            assert(m->isStore());
-            gmReturnedStores.pop();
-            assert(inflightStores > 0);
-            --inflightStores;
-        }
+        completeRequest(m);
 
         // Decrement outstanding register count
         computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
@@ -129,15 +118,30 @@
             } else {
                 ++inflightLoads;
             }
-        } else {
+        } else if (mp->isStore()) {
             if (inflightStores >= gmQueueSize) {
                 return;
-            } else if (mp->isStore()) {
+            } else {
                 ++inflightStores;
             }
         }
 
         mp->initiateAcc(mp);
+
+        if (!outOfOrderDataDelivery && !mp->isMemFence()) {
+            /**
+             * if we are not in out-of-order data delivery mode
+             * then we keep the responses sorted in program order.
+             * in order to do so we must reserve an entry in the
+             * resp buffer before we issue the request to the mem
+             * system. mem fence requests will not be stored here
+             * because once they are issued from the GM pipeline,
+             * they do not send any response back to it.
+             */
+            gmOrderedRespBuffer.insert(std::make_pair(mp->seqNum(),
+                std::make_pair(mp, false)));
+        }
+
         gmIssuedRequests.pop();
 
         DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
@@ -145,6 +149,86 @@
     }
 }
 
+GPUDynInstPtr
+GlobalMemPipeline::getNextReadyResp()
+{
+    if (outOfOrderDataDelivery) {
+        if (!gmReturnedLoads.empty()) {
+            return gmReturnedLoads.front();
+        } else if (!gmReturnedStores.empty()) {
+            return gmReturnedStores.front();
+        }
+    } else {
+        if (!gmOrderedRespBuffer.empty()) {
+            auto mem_req = gmOrderedRespBuffer.begin();
+
+            if (mem_req->second.second) {
+                return mem_req->second.first;
+            }
+        }
+    }
+
+    return nullptr;
+}
+
+void
+GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst)
+{
+    if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
+        assert(inflightLoads > 0);
+        --inflightLoads;
+    } else if (gpuDynInst->isStore()) {
_______________________________________________
gem5-dev mailing list
gem5-dev@gem5.org
http://m5sim.org/mailman/listinfo/gem5-dev

Reply via email to