Matthew Poremba has submitted this change. (
https://gem5-review.googlesource.com/c/public/gem5/+/32859 )
Change subject: gpu-compute,mem-ruby: Replace ACQUIRE and RELEASE request
flags
......................................................................
gpu-compute,mem-ruby: Replace ACQUIRE and RELEASE request flags
This patch replaces ACQUIRE and RELEASE flags which are HSA-specific.
ACQUIRE flag becomes INV_L1 in VIPER protocol. RELEASE flag is removed.
Future protocols may support extra cache coherence flags like INV_L2 and
WB_L2.
Change-Id: I3d60c9d3625c898f4110a12d81742b6822728533
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/32859
Reviewed-by: Jason Lowe-Power <power...@gmail.com>
Reviewed-by: Matt Sinclair <mattdsincl...@gmail.com>
Maintainer: Matt Sinclair <mattdsincl...@gmail.com>
Tested-by: kokoro <noreply+kok...@google.com>
---
M src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc
M src/gpu-compute/compute_unit.cc
M src/gpu-compute/gpu_dyn_inst.hh
M src/mem/request.hh
M src/mem/ruby/system/GPUCoalescer.cc
M src/mem/ruby/system/VIPERCoalescer.cc
6 files changed, 60 insertions(+), 39 deletions(-)
Approvals:
Jason Lowe-Power: Looks good to me, approved
Matt Sinclair: Looks good to me, but someone else must approve; Looks
good to me, approved
kokoro: Regressions pass
diff --git a/src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc
b/src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc
index dbdaba4..6b3c3a0 100644
--- a/src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc
+++ b/src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc
@@ -232,7 +232,7 @@
threadId, nullptr);
acq_req->setPaddr(0);
acq_req->setReqInstSeqNum(tester->getActionSeqNum());
- acq_req->setFlags(Request::ACQUIRE);
+ acq_req->setCacheCoherenceFlags(Request::INV_L1);
// set protocol-specific flags
setExtraRequestFlags(acq_req);
diff --git a/src/gpu-compute/compute_unit.cc
b/src/gpu-compute/compute_unit.cc
index 2787e42..1da5a45 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -805,9 +805,9 @@
// here (simdId=-1, wfSlotId=-1)
if (gpuDynInst->isKernelLaunch()) {
// for kernel launch, the original request must be both
kernel-type
- // and acquire
+ // and INV_L1
assert(pkt->req->isKernel());
- assert(pkt->req->isAcquire());
+ assert(pkt->req->isInvL1());
// one D-Cache inv is done, decrement counter
dispatcher.updateInvCounter(gpuDynInst->kern_id);
@@ -820,16 +820,19 @@
// retrieve wavefront from inst
Wavefront *w = gpuDynInst->wavefront();
- // Check if we are waiting on Kernel End Release
+ // Check if we are waiting on Kernel End Flush
if (w->getStatus() == Wavefront::S_RETURNING
&& gpuDynInst->isEndOfKernel()) {
// for kernel end, the original request must be both
kernel-type
- // and release
+ // and last-level GPU cache should be flushed if it contains
+ // dirty data. This request may have been quiesced and
+ // immediately responded to if the GL2 is a write-through /
+ // read-only cache.
assert(pkt->req->isKernel());
- assert(pkt->req->isRelease());
+ assert(pkt->req->isGL2CacheFlush());
- // one wb done, decrement counter, and return whether all wbs
are
- // done for the kernel
+ // once flush done, decrement counter, and return whether all
+ // dirty writeback operations are done for the kernel
bool isWbDone =
dispatcher.updateWbCounter(gpuDynInst->kern_id);
// not all wbs are done for the kernel, just release pkt
@@ -1218,7 +1221,7 @@
if (kernelMemSync) {
if (gpuDynInst->isKernelLaunch()) {
- req->setCacheCoherenceFlags(Request::ACQUIRE);
+ req->setCacheCoherenceFlags(Request::INV_L1);
req->setReqInstSeqNum(gpuDynInst->seqNum());
req->setFlags(Request::KERNEL);
pkt = new Packet(req, MemCmd::MemSyncReq);
@@ -1234,11 +1237,12 @@
schedule(mem_req_event, curTick() + req_tick_latency);
} else {
- // kernel end release must be enabled
+ // kernel end flush of GL2 cache may be quiesced by Ruby if the
+ // GL2 is a read-only cache
assert(shader->impl_kern_end_rel);
assert(gpuDynInst->isEndOfKernel());
- req->setCacheCoherenceFlags(Request::WB_L2);
+ req->setCacheCoherenceFlags(Request::FLUSH_L2);
req->setReqInstSeqNum(gpuDynInst->seqNum());
req->setFlags(Request::KERNEL);
pkt = new Packet(req, MemCmd::MemSyncReq);
diff --git a/src/gpu-compute/gpu_dyn_inst.hh
b/src/gpu-compute/gpu_dyn_inst.hh
index f34eff6..cdb130e 100644
--- a/src/gpu-compute/gpu_dyn_inst.hh
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@@ -306,7 +306,7 @@
assert(!isEndOfKernel());
// must be wbinv inst if not kernel launch/end
- req->setCacheCoherenceFlags(Request::ACQUIRE);
+ req->setCacheCoherenceFlags(Request::INV_L1);
}
}
diff --git a/src/mem/request.hh b/src/mem/request.hh
index 73c823b..b9d7e14 100644
--- a/src/mem/request.hh
+++ b/src/mem/request.hh
@@ -260,30 +260,36 @@
typedef ::Flags<CacheCoherenceFlagsType> CacheCoherenceFlags;
/**
- * These bits are used to set the coherence policy
- * for the GPU and are encoded in the GCN3 instructions.
- * See the AMD GCN3 ISA Architecture Manual for more
- * details.
+ * These bits are used to set the coherence policy for the GPU and are
+ * encoded in the GCN3 instructions. The GCN3 ISA defines two cache
levels.
+ * See the AMD GCN3 ISA Architecture Manual for more details.
*
* INV_L1: L1 cache invalidation
- * WB_L2: L2 cache writeback
+ * FLUSH_L2: L2 cache flush
*
- * SLC: System Level Coherent. Accesses are forced to miss in
- * the L2 cache and are coherent with system memory.
+ * Invalidation means to simply discard all cache contents. This can be
+ * done in the L1 since it is implemented as a write-through cache and
+ * there are other copies elsewhere in the hierarchy.
*
- * GLC: Globally Coherent. Controls how reads and writes are
- * handled by the L1 cache. Global here referes to the
- * data being visible globally on the GPU (i.e., visible
- * to all WGs).
+ * For flush the contents of the cache need to be written back to
memory
+ * when dirty and can be discarded otherwise. This operation is more
+ * involved than invalidation and therefore we do not flush caches with
+ * redundant copies of data.
*
- * For atomics, the GLC bit is used to distinguish between
- * between atomic return/no-return operations.
+ * SLC: System Level Coherent. Accesses are forced to miss in the L2
cache
+ * and are coherent with system memory.
+ *
+ * GLC: Globally Coherent. Controls how reads and writes are handled by
+ * the L1 cache. Global here refers to the data being visible
+ * globally on the GPU (i.e., visible to all WGs).
+ *
+ * For atomics, the GLC bit is used to distinguish between
atomic
+ * return/no-return operations. These flags are used by GPUDynInst.
*/
enum : CacheCoherenceFlagsType {
/** mem_sync_op flags */
INV_L1 = 0x00000001,
- WB_L2 = 0x00000020,
- /** user-policy flags */
+ FLUSH_L2 = 0x00000020,
/** user-policy flags */
SLC_BIT = 0x00000080,
GLC_BIT = 0x00000100,
@@ -938,11 +944,15 @@
/**
* Accessor functions for the memory space configuration flags and
used by
* GPU ISAs such as the Heterogeneous System Architecture (HSA). Note
that
- * these are for testing only; setting extraFlags should be done via
- * setCacheCoherenceFlags().
+ * setting extraFlags should be done via setCacheCoherenceFlags().
*/
- bool isSLC() const { return _cacheCoherenceFlags.isSet(SLC_BIT); }
- bool isGLC() const { return _cacheCoherenceFlags.isSet(GLC_BIT); }
+ bool isInvL1() const { return _cacheCoherenceFlags.isSet(INV_L1); }
+
+ bool
+ isGL2CacheFlush() const
+ {
+ return _cacheCoherenceFlags.isSet(FLUSH_L2);
+ }
/**
* Accessor functions to determine whether this request is part of
diff --git a/src/mem/ruby/system/GPUCoalescer.cc
b/src/mem/ruby/system/GPUCoalescer.cc
index 3f73568..b51a9e7 100644
--- a/src/mem/ruby/system/GPUCoalescer.cc
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -587,7 +587,15 @@
assert(pkt->isRead() || pkt->isWrite());
InstSeqNum seq_num = pkt->req->getReqInstSeqNum();
- int num_packets = getDynInst(pkt)->exec_mask.count();
+
+ // in the case of protocol tester, there is one packet per sequence
+ // number. The number of packets during simulation depends on the
+ // number of lanes active for that vmem request (i.e., the popcnt
+ // of the exec_mask).
+ int num_packets = 1;
+ if (!m_usingRubyTester) {
+ num_packets = getDynInst(pkt)->exec_mask.count();
+ }
// the pkt is temporarily stored in the uncoalesced table until
// it's picked for coalescing process later in this cycle or in a
diff --git a/src/mem/ruby/system/VIPERCoalescer.cc
b/src/mem/ruby/system/VIPERCoalescer.cc
index 69add1b..111f9f2 100644
--- a/src/mem/ruby/system/VIPERCoalescer.cc
+++ b/src/mem/ruby/system/VIPERCoalescer.cc
@@ -70,20 +70,19 @@
VIPERCoalescer::makeRequest(PacketPtr pkt)
{
// VIPER only supports following memory request types
- // MemSyncReq & Acquire: TCP cache invalidation
+ // MemSyncReq & INV_L1 : TCP cache invalidation
// ReadReq : cache read
// WriteReq : cache write
// AtomicOp : cache atomic
//
// VIPER does not expect MemSyncReq & Release since in GCN3, compute
unit
// does not specify an equivalent type of memory request.
- // TODO: future patches should rename Acquire and Release
- assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isAcquire()) ||
+ assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL1()) ||
pkt->cmd == MemCmd::ReadReq ||
pkt->cmd == MemCmd::WriteReq ||
pkt->isAtomicOp());
- if (pkt->req->isAcquire() && m_cache_inv_pkt) {
+ if (pkt->req->isInvL1() && m_cache_inv_pkt) {
// In VIPER protocol, the coalescer is not able to handle two or
// more cache invalidation requests at a time. Cache invalidation
// requests must be serialized to ensure that all stale data in
@@ -94,8 +93,8 @@
GPUCoalescer::makeRequest(pkt);
- if (pkt->req->isAcquire()) {
- // In VIPER protocol, a compute unit sends a MemSyncReq with
Acquire
+ if (pkt->req->isInvL1()) {
+ // In VIPER protocol, a compute unit sends a MemSyncReq with INV_L1
// flag to invalidate TCP. Upon receiving a request of this type,
// VIPERCoalescer starts a cache walk to invalidate all valid
entries
// in TCP. The request is completed once all entries are
invalidated.
@@ -276,7 +275,7 @@
}
/**
- * Invalidate TCP (Acquire)
+ * Invalidate TCP
*/
void
VIPERCoalescer::invTCP()
--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/32859
To unsubscribe, or for help writing mail filters, visit
https://gem5-review.googlesource.com/settings
Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I3d60c9d3625c898f4110a12d81742b6822728533
Gerrit-Change-Number: 32859
Gerrit-PatchSet: 14
Gerrit-Owner: Bradford Beckmann <bradford.beckm...@gmail.com>
Gerrit-Reviewer: Jason Lowe-Power <power...@gmail.com>
Gerrit-Reviewer: Matt Sinclair <mattdsincl...@gmail.com>
Gerrit-Reviewer: Matthew Poremba <matthew.pore...@amd.com>
Gerrit-Reviewer: Tuan Ta <q...@cornell.edu>
Gerrit-Reviewer: kokoro <noreply+kok...@google.com>
Gerrit-CC: Anthony Gutierrez <anthony.gutier...@amd.com>
Gerrit-CC: Pouya Fotouhi <pfoto...@ucdavis.edu>
Gerrit-MessageType: merged
_______________________________________________
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s