Matthew Poremba has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/57711 )

( 16 is the latest approved patch-set. No files were changed between the
latest approved patch-set and the submitted one. )

Change subject: gpu-compute: Support Scalar and Vector access to system pages
......................................................................

gpu-compute: Support Scalar and Vector access to system pages

The amdgpu driver supports reading and writing scalar and vector memory
addresses that reside in system memory. This is commonly used for things
like blit kernels that perform host-to-device or device-to-host copies
using GPU load/store instructions.

This is done by utilizing the system hub device added in a prior
changeset. Memory packets translated by the Scalar or VMEM TLBs will
have the corresponding system request field set from the PTE in the TLB,
which the compute unit can use to determine whether a request targets
system memory or device memory.
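
As a rough sketch of that dispatch decision (the helper below is
hypothetical; Request::systemReq(), Request::requestorId(),
vramRequestorId(), and the system hub call are the interfaces used in
the diff):

    // Hypothetical helper showing how a translated vector packet is
    // routed in the compute unit.
    bool
    routeTranslatedPacket(PacketPtr pkt, ComputeUnit *cu,
                          ComputeUnit::DataPort *port)
    {
        if (pkt->req->systemReq()) {
            // System page: hand the packet to the system hub, which
            // DMAs host memory and calls back into handleResponse().
            auto *resp_event =
                new ComputeUnit::DataPort::SystemHubEvent(pkt, port);
            cu->shader->systemHub->sendRequest(pkt, resp_event);
            return true;
        }
        // Device page: retarget the requestor ID at VRAM so the request
        // stays on the device-memory Ruby network.
        pkt->req->requestorId(cu->vramRequestorId());
        return port->sendTimingReq(pkt);
    }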

Another important change is returning global memory tokens for system
requests. Since these requests do not flow through the GPU coalescer,
where tokens are normally returned, the token is instead returned as
soon as the request is known to be a system request.
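
The token bookkeeping amounts to a one-time return per instruction,
roughly like this (mirroring the vector path in the diff below; the
scalar path only marks the instruction):

    // Multiple packets can belong to the same gpuDynInst, so return the
    // token only the first time a system request is seen for it.
    if (!gpuDynInst->isSystemReq()) {
        computeUnit->getTokenManager()->recvTokens(1);
        gpuDynInst->setSystemReq();
    }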

Change-Id: I35030e0b3698f10c63a397f96b81267271e3130e
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/57711
Reviewed-by: Matt Sinclair <mattdsincl...@gmail.com>
Maintainer: Matt Sinclair <mattdsincl...@gmail.com>
Tested-by: kokoro <noreply+kok...@google.com>
---
M src/gpu-compute/compute_unit.cc
M src/gpu-compute/compute_unit.hh
M src/gpu-compute/global_memory_pipeline.cc
M src/gpu-compute/gpu_dyn_inst.hh
4 files changed, 154 insertions(+), 9 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  kokoro: Regressions pass




diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index e1794a8..6af45fe 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -112,6 +112,12 @@
     scheduleToExecute(p),
     stats(this, p.n_wf)
 {
+    // This is not currently supported and would require adding more handling
+    // for system vs. device memory requests on the functional paths, so we
+    // fatal immediately in the constructor if this configuration is seen.
+    fatal_if(functionalTLB && FullSystem,
+             "Functional TLB not supported in full-system GPU simulation");
+
     /**
      * This check is necessary because std::bitset only provides conversion
      * to unsigned long or unsigned long long via to_ulong() or to_ullong().
@@ -801,6 +807,12 @@
 bool
 ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
 {
+    return handleResponse(pkt);
+}
+
+bool
+ComputeUnit::DataPort::handleResponse(PacketPtr pkt)
+{
     // Ruby has completed the memory op. Schedule the mem_resp_event at the
     // appropriate cycle to process the timing memory response
     // This delay represents the pipeline delay
@@ -902,6 +914,12 @@
 bool
 ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt)
 {
+    return handleResponse(pkt);
+}
+
+bool
+ComputeUnit::ScalarDataPort::handleResponse(PacketPtr pkt)
+{
     assert(!pkt->req->isKernel());

     // retrieve sender state
@@ -1241,9 +1259,13 @@
     assert(gpuDynInst->isGlobalSeg() ||
            gpuDynInst->executedAs() == enums::SC_GLOBAL);

+    // Fences will never be issued to system memory, so we can set the
+    // requestor to the device memory (VRAM) requestor ID here.
     if (!req) {
         req = std::make_shared<Request>(
-            0, 0, 0, requestorId(), 0, gpuDynInst->wfDynId);
+            0, 0, 0, vramRequestorId(), 0, gpuDynInst->wfDynId);
+    } else {
+        req->requestorId(vramRequestorId());
     }

     // all mem sync requests have Paddr == 0
@@ -1544,6 +1566,24 @@
             new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
                                                    nullptr);

+    // Set VRAM ID for device requests
+    // For now, system vmem requests use functional reads. This is not that
+    // critical to model as the region of interest should always be accessing
+    // device memory. System vmem requests are used by blit kernels to do
+    // memcpys and load code objects into device memory.
+    if (new_pkt->req->systemReq()) {
+        // There will be multiple packets returned for the same gpuDynInst,
+        // so first check if systemReq is not already set and if so, return
+        // the token acquired when the dispatch list is filled as system
+        // requests do not require a GPU coalescer token.
+        if (!gpuDynInst->isSystemReq()) {
+            computeUnit->getTokenManager()->recvTokens(1);
+            gpuDynInst->setSystemReq();
+        }
+    } else {
+        new_pkt->req->requestorId(computeUnit->vramRequestorId());
+    }
+
     // translation is done. Schedule the mem_req_event at the appropriate
     // cycle to send the timing memory request to ruby
     EventFunctionWrapper *mem_req_event =
@@ -1582,7 +1622,11 @@
     GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
     [[maybe_unused]] ComputeUnit *compute_unit = computeUnit;

-    if (!(sendTimingReq(pkt))) {
+    if (pkt->req->systemReq()) {
+        assert(compute_unit->shader->systemHub);
+        SystemHubEvent *resp_event = new SystemHubEvent(pkt, this);
+        compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
+    } else if (!(sendTimingReq(pkt))) {
         retries.push_back(std::make_pair(pkt, gpuDynInst));

         DPRINTF(GPUPort,
@@ -1611,7 +1655,11 @@
     GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
     [[maybe_unused]] ComputeUnit *compute_unit = scalarDataPort.computeUnit;

-    if (!(scalarDataPort.sendTimingReq(pkt))) {
+    if (pkt->req->systemReq()) {
+        assert(compute_unit->shader->systemHub);
+        SystemHubEvent *resp_event = new SystemHubEvent(pkt, &scalarDataPort);
+        compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
+    } else if (!(scalarDataPort.sendTimingReq(pkt))) {
         scalarDataPort.retries.push_back(pkt);

         DPRINTF(GPUPort,
@@ -1712,15 +1760,26 @@
     req_pkt->senderState =
         new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst);

-    if (!computeUnit->scalarDataPort.sendTimingReq(req_pkt)) {
-        computeUnit->scalarDataPort.retries.push_back(req_pkt);
-        DPRINTF(GPUMem, "send scalar req failed for: %s\n",
-                gpuDynInst->disassemble());
+    // For a system request we want to mark the GPU instruction as a system
+    // load/store so that after the request is issued to system memory we can
+    // return any token acquired for the request. Since tokens are returned
+    // by the coalescer and system requests do not take that path, this needs
+    // to be tracked.
+    //
+    // Device requests change the requestor ID to something in the device
+    // memory Ruby network.
+    if (req_pkt->req->systemReq()) {
+        gpuDynInst->setSystemReq();
     } else {
-        DPRINTF(GPUMem, "send scalar req for: %s\n",
-                gpuDynInst->disassemble());
+        req_pkt->req->requestorId(computeUnit->vramRequestorId());
     }

+    ComputeUnit::ScalarDataPort::MemReqEvent *scalar_mem_req_event
+            = new ComputeUnit::ScalarDataPort::MemReqEvent
+                (computeUnit->scalarDataPort, req_pkt);
+    computeUnit->schedule(scalar_mem_req_event, curTick() +
+                          computeUnit->req_tick_latency);
+
     return true;
 }

diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index 1c211d9..a080e3d 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -529,6 +529,28 @@
                   saved(sender_state) { }
         };

+        class SystemHubEvent : public Event
+        {
+          DataPort *dataPort;
+          PacketPtr reqPkt;
+
+          public:
+            SystemHubEvent(PacketPtr pkt, DataPort *_dataPort)
+                : dataPort(_dataPort), reqPkt(pkt)
+            {
+                setFlags(Event::AutoDelete);
+            }
+
+            void
+            process()
+            {
+                // DMAs do not operate on packets and therefore do not
+                // convert to a response. Do that here instead.
+                reqPkt->makeResponse();
+                dataPort->handleResponse(reqPkt);
+            }
+        };
+
         void processMemReqEvent(PacketPtr pkt);
         EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);

@@ -537,6 +559,8 @@

         std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;

+        bool handleResponse(PacketPtr pkt);
+
       protected:
         ComputeUnit *computeUnit;

@@ -596,6 +620,30 @@
             const char *description() const;
         };

+        class SystemHubEvent : public Event
+        {
+          ScalarDataPort *dataPort;
+          PacketPtr reqPkt;
+
+          public:
+            SystemHubEvent(PacketPtr pkt, ScalarDataPort *_dataPort)
+                : dataPort(_dataPort), reqPkt(pkt)
+            {
+                setFlags(Event::AutoDelete);
+            }
+
+            void
+            process()
+            {
+                // DMAs do not operate on packets and therefore do not
+                // convert to a response. Do that here instead.
+                reqPkt->makeResponse();
+                dataPort->handleResponse(reqPkt);
+            }
+        };
+
+        bool handleResponse(PacketPtr pkt);
+
         std::deque<PacketPtr> retries;

       private:
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
index f766f7e..08303a4 100644
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -62,6 +62,10 @@
 bool
 GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
 {
+    // System requests do not need GPU coalescer tokens. Make sure nothing
+    // has bypassed the operand gather check stage.
+    assert(!mp->isSystemReq());
+
     // We require one token from the coalescer's uncoalesced table to
     // proceed
     int token_count = 1;
diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh
index 17c7ee6..e2884a0 100644
--- a/src/gpu-compute/gpu_dyn_inst.hh
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@@ -476,11 +476,16 @@

     // inst used to save/restore a wavefront context
     bool isSaveRestore;
+
+    bool isSystemReq() { return systemReq; }
+    void setSystemReq() { systemReq = true; }
+
   private:
     GPUStaticInst *_staticInst;
     const InstSeqNum _seqNum;
     int maxSrcVecRegOpSize;
     int maxSrcScalarRegOpSize;
+    bool systemReq = false;

     // the time the request was started
     Tick accessTime = -1;

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/57711
To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I35030e0b3698f10c63a397f96b81267271e3130e
Gerrit-Change-Number: 57711
Gerrit-PatchSet: 22
Gerrit-Owner: Matthew Poremba <matthew.pore...@amd.com>
Gerrit-Reviewer: Kyle Roarty <kyleroarty1...@gmail.com>
Gerrit-Reviewer: Matt Sinclair <mattdsincl...@gmail.com>
Gerrit-Reviewer: Matthew Poremba <matthew.pore...@amd.com>
Gerrit-Reviewer: kokoro <noreply+kok...@google.com>
Gerrit-MessageType: merged