[gem5-dev] Change in gem5/gem5[develop]: configs,gpu-compute: Support fetch from system pages
Matthew Poremba has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/57652 ) Change subject: configs,gpu-compute: Support fetch from system pages .. configs,gpu-compute: Support fetch from system pages The amdgpu driver supports fetching instructions from pages which reside in system memory rather than device memory. This changeset adds support to do this by adding the system hub object added in a prior changeset to the fetch unit and issues requests to the system hub if the system bit in the memory page's PTE is set. Otherwise, the requestor ID is set to be device memory and the request is routed through the Ruby network / GPU caches to fetch the instructions. Change-Id: Ib2fb47c589fdd5e544ab6493d7dbd8f2d9d7b0e8 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/57652 Reviewed-by: Jason Lowe-Power Maintainer: Jason Lowe-Power Tested-by: kokoro --- M configs/example/gpufs/system/system.py M src/gpu-compute/GPU.py M src/gpu-compute/compute_unit.cc M src/gpu-compute/compute_unit.hh M src/gpu-compute/fetch_unit.cc M src/gpu-compute/fetch_unit.hh M src/gpu-compute/shader.cc M src/gpu-compute/shader.hh 8 files changed, 87 insertions(+), 3 deletions(-) Approvals: Jason Lowe-Power: Looks good to me, approved; Looks good to me, approved kokoro: Regressions pass diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py index 8c9895f..972a4f9 100644 --- a/configs/example/gpufs/system/system.py +++ b/configs/example/gpufs/system/system.py @@ -133,6 +133,10 @@ gpu_mem_mgr = AMDGPUMemoryManager() system.pc.south_bridge.gpu.memory_manager = gpu_mem_mgr +# CPU data path (SystemHub) +system_hub = AMDGPUSystemHub() +shader.system_hub = system_hub + # GPU, HSAPP, and GPUCommandProc are DMA devices system._dma_ports.append(gpu_hsapp) system._dma_ports.append(gpu_cmd_proc) @@ -141,6 +145,7 @@ system._dma_ports.append(sdma1) system._dma_ports.append(device_ih) system._dma_ports.append(pm4_pkt_proc) 
+system._dma_ports.append(system_hub) system._dma_ports.append(gpu_mem_mgr) system._dma_ports.append(hsapp_pt_walker) system._dma_ports.append(cp_pt_walker) @@ -154,6 +159,7 @@ sdma1.pio = system.iobus.mem_side_ports device_ih.pio = system.iobus.mem_side_ports pm4_pkt_proc.pio = system.iobus.mem_side_ports +system_hub.pio = system.iobus.mem_side_ports # Full system needs special TLBs for SQC, Scalar, and vector data ports args.full_system = True diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index 3e5fba6..a0154a7 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -224,6 +224,7 @@ CUs = VectorParam.ComputeUnit('Number of compute units') gpu_cmd_proc = Param.GPUCommandProcessor('Command processor for GPU') dispatcher = Param.GPUDispatcher('GPU workgroup dispatcher') +system_hub = Param.AMDGPUSystemHub(NULL, 'GPU System Hub (FS Mode only)') n_wf = Param.Int(10, 'Number of wavefront slots per SIMD') impl_kern_launch_acq = Param.Bool(True, """Insert acq packet into ruby at kernel launch""") diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index cc6244b..e1794a8 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -979,11 +979,18 @@ bool ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt) { -computeUnit->fetchStage.processFetchReturn(pkt); +computeUnit->handleSQCReturn(pkt); + return true; } void +ComputeUnit::handleSQCReturn(PacketPtr pkt) +{ +fetchStage.processFetchReturn(pkt); +} + +void ComputeUnit::SQCPort::recvReqRetry() { int len = retries.size(); diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index 87ed541..1c211d9 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -463,6 +463,8 @@ bool isDone() const; bool isVectorAluIdle(uint32_t simdId) const; +void handleSQCReturn(PacketPtr pkt); + protected: RequestorID _requestorId; diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc index 
6e35818..640e29b 100644 --- a/src/gpu-compute/fetch_unit.cc +++ b/src/gpu-compute/fetch_unit.cc @@ -206,6 +206,15 @@ computeUnit.sqcTLBPort.sendFunctional(pkt); +/** + * For full system, if this is a device request we need to set the + * requestor ID of the packet to the GPU memory manager so it is routed + * through Ruby as a memory request and not a PIO request. + */ +if (!pkt->req->systemReq()) { +pkt->req->requestorId(computeUnit.vramRequestorId()); +} + GpuTranslationState *sender_state = safe_cast<GpuTranslationState*>(pkt->senderState); @@ -250,6 +259,15 @@ } /** + * For fu
[gem5-dev] Change in gem5/gem5[develop]: configs,gpu-compute: Support fetch from system pages
Matthew Poremba has uploaded this change for review. ( https://gem5-review.googlesource.com/c/public/gem5/+/57652 ) Change subject: configs,gpu-compute: Support fetch from system pages .. configs,gpu-compute: Support fetch from system pages The amdgpu driver supports fetching instructions from pages which reside in system memory rather than device memory. This changeset adds support to do this by adding the system hub object added in a prior changeset to the fetch unit and issues requests to the system hub if the system bit in the memory page's PTE is set. Otherwise, the requestor ID is set to be device memory and the request is routed through the Ruby network / GPU caches to fetch the instructions. Change-Id: Ib2fb47c589fdd5e544ab6493d7dbd8f2d9d7b0e8 --- M configs/example/gpufs/system/system.py M src/gpu-compute/GPU.py M src/gpu-compute/compute_unit.cc M src/gpu-compute/compute_unit.hh M src/gpu-compute/fetch_unit.cc M src/gpu-compute/fetch_unit.hh M src/gpu-compute/shader.cc M src/gpu-compute/shader.hh 8 files changed, 83 insertions(+), 3 deletions(-) diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py index 81dbd2d..6bd9023 100644 --- a/configs/example/gpufs/system/system.py +++ b/configs/example/gpufs/system/system.py @@ -133,6 +133,10 @@ gpu_mem_mgr = AMDGPUMemoryManager() system.pc.south_bridge.gpu.memory_manager = gpu_mem_mgr +# CPU data path (SystemHub) +system_hub = AMDGPUSystemHub() +shader.system_hub = system_hub + # GPU, HSAPP, and GPUCommandProc are DMA devices system._dma_ports.append(gpu_hsapp) system._dma_ports.append(gpu_cmd_proc) @@ -141,6 +145,7 @@ system._dma_ports.append(sdma1) system._dma_ports.append(device_ih) system._dma_ports.append(pm4_pkt_proc) +system._dma_ports.append(system_hub) system._dma_ports.append(gpu_mem_mgr) system._dma_ports.append(hsapp_pt_walker) system._dma_ports.append(cp_pt_walker) @@ -154,6 +159,7 @@ sdma1.pio = system.iobus.mem_side_ports device_ih.pio = 
system.iobus.mem_side_ports pm4_pkt_proc.pio = system.iobus.mem_side_ports +system_hub.pio = system.iobus.mem_side_ports # Full system needs special TLBs for SQC, Scalar, and vector data ports args.full_system = True diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index 3e5fba6..a0154a7 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -224,6 +224,7 @@ CUs = VectorParam.ComputeUnit('Number of compute units') gpu_cmd_proc = Param.GPUCommandProcessor('Command processor for GPU') dispatcher = Param.GPUDispatcher('GPU workgroup dispatcher') +system_hub = Param.AMDGPUSystemHub(NULL, 'GPU System Hub (FS Mode only)') n_wf = Param.Int(10, 'Number of wavefront slots per SIMD') impl_kern_launch_acq = Param.Bool(True, """Insert acq packet into ruby at kernel launch""") diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 11802d2..b40b464 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -979,11 +979,18 @@ bool ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt) { -computeUnit->fetchStage.processFetchReturn(pkt); +computeUnit->handleSQCReturn(pkt); + return true; } void +ComputeUnit::handleSQCReturn(PacketPtr pkt) +{ +fetchStage.processFetchReturn(pkt); +} + +void ComputeUnit::SQCPort::recvReqRetry() { int len = retries.size(); diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index 87ed541..1c211d9 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -463,6 +463,8 @@ bool isDone() const; bool isVectorAluIdle(uint32_t simdId) const; +void handleSQCReturn(PacketPtr pkt); + protected: RequestorID _requestorId; diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc index 6e35818..640e29b 100644 --- a/src/gpu-compute/fetch_unit.cc +++ b/src/gpu-compute/fetch_unit.cc @@ -206,6 +206,15 @@ computeUnit.sqcTLBPort.sendFunctional(pkt); +/** + * For full system, if this is a device request we need to set the + 
* requestor ID of the packet to the GPU memory manager so it is routed + * through Ruby as a memory request and not a PIO request. + */ +if (!pkt->req->systemReq()) { +pkt->req->requestorId(computeUnit.vramRequestorId()); +} + GpuTranslationState *sender_state = safe_cast<GpuTranslationState*>(pkt->senderState); @@ -250,6 +259,15 @@ } /** + * For full system, if this is a device request we need to set the + * requestor ID of the packet to the GPU memory manager so it is routed + * through Ruby as a memory request and not a PIO request. + */ +if (!pkt->req->systemReq()) { +pkt-