Matt Sinclair has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/31974 )
Change subject: arch-gcn3: add support for flat atomic adds, subs, incs, decs
......................................................................
arch-gcn3: add support for flat atomic adds, subs, incs, decs
Add support for all missing flat atomic adds, subtracts, increments,
and decrements, including their x2 variants.
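For reference, the inc and dec variants are not plain add/sub by one:
per the GCN3 ISA manual they wrap around a source-operand bound. A
minimal sketch of the memory-side semantics (names here are
illustrative, not gem5 code; "mem" stands in for the 32-bit location
the flat address resolves to and "src" is the DATA operand):

    #include <cstdint>

    // FLAT_ATOMIC_INC: wraps to 0 once the old value reaches src.
    uint32_t atomicIncSemantics(uint32_t &mem, uint32_t src)
    {
        uint32_t old = mem;
        mem = (old >= src) ? 0 : old + 1;
        return old; // written to VDST when the atomic returns data
    }

    // FLAT_ATOMIC_DEC: wraps to src when the old value is 0 (or > src).
    uint32_t atomicDecSemantics(uint32_t &mem, uint32_t src)
    {
        uint32_t old = mem;
        mem = (old == 0 || old > src) ? src : old - 1;
        return old;
    }

The x2 variants apply the same operations to 64-bit values.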
Change-Id: I37a67fcacca91a09a82be6597facaa366105d2dc
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/31974
Reviewed-by: Anthony Gutierrez <anthony.gutier...@amd.com>
Maintainer: Anthony Gutierrez <anthony.gutier...@amd.com>
Tested-by: kokoro <noreply+kok...@google.com>
---
M src/arch/gcn3/insts/instructions.cc
M src/arch/gcn3/insts/instructions.hh
2 files changed, 410 insertions(+), 6 deletions(-)
Approvals:
Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
kokoro: Regressions pass
diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc
index 426f991..6e81e2c 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -40643,8 +40643,72 @@
void
Inst_FLAT__FLAT_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst)
{
- panicUnimplemented();
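+ // Common pattern for each flat atomic implemented below: if no
+ // lanes are active, roll back the issue counters and return;
+ // otherwise set the execution mask and latency, read the 64-bit
+ // flat address and the source data, resolve per-lane addresses,
+ // pack active lanes' source values into a_data, and issue the
+ // request to the global memory pipeline. Atomics are counted as
+ // both an outstanding read and an outstanding write.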
+ Wavefront *wf = gpuDynInst->wavefront();
+
+ if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
+ wf->wrGmReqsInPipe--;
+ wf->rdGmReqsInPipe--;
+ return;
+ }
+
+ gpuDynInst->execUnitId = wf->execUnitId;
+ gpuDynInst->exec_mask = wf->execMask();
+ gpuDynInst->latency.init(gpuDynInst->computeUnit());
+ gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+ ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+ ConstVecOperandU32 data(gpuDynInst, extData.DATA);
+
+ addr.read();
+ data.read();
+
+ calcAddr(gpuDynInst, addr);
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane]
+ = data[lane];
+ }
+ }
+
+ if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
+ gpuDynInst->computeUnit()->globalMemoryPipe.
+ issueRequest(gpuDynInst);
+ wf->wrGmReqsInPipe--;
+ wf->outstandingReqsWrGm++;
+ wf->rdGmReqsInPipe--;
+ wf->outstandingReqsRdGm++;
+ } else {
+ fatal("Non global flat instructions not implemented yet.\n");
+ }
+
+ gpuDynInst->wavefront()->outstandingReqs++;
+ gpuDynInst->wavefront()->validateRequestCounters();
}
+ void
+ Inst_FLAT__FLAT_ATOMIC_SUB::initiateAcc(GPUDynInstPtr gpuDynInst)
+ {
+ initAtomicAccess<VecElemU32>(gpuDynInst);
+ } // initiateAcc
+
+ void
+ Inst_FLAT__FLAT_ATOMIC_SUB::completeAcc(GPUDynInstPtr gpuDynInst)
+ {
+ if (isAtomicRet()) {
+ VecOperandU32 vdst(gpuDynInst, extData.VDST);
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ vdst[lane] = (reinterpret_cast<VecElemU32*>(
+ gpuDynInst->d_data))[lane];
+ }
+ }
+
+ vdst.write();
+ }
+ } // completeAcc
Inst_FLAT__FLAT_ATOMIC_SMIN::Inst_FLAT__FLAT_ATOMIC_SMIN(InFmt_FLAT
*iFmt)
: Inst_FLAT(iFmt, "flat_atomic_smin")
@@ -40843,9 +40907,74 @@
void
Inst_FLAT__FLAT_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst)
{
- panicUnimplemented();
+ Wavefront *wf = gpuDynInst->wavefront();
+
+ if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
+ wf->wrGmReqsInPipe--;
+ wf->rdGmReqsInPipe--;
+ return;
+ }
+
+ gpuDynInst->execUnitId = wf->execUnitId;
+ gpuDynInst->exec_mask = wf->execMask();
+ gpuDynInst->latency.init(gpuDynInst->computeUnit());
+ gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+ ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+ ConstVecOperandU32 data(gpuDynInst, extData.DATA);
+
+ addr.read();
+ data.read();
+
+ calcAddr(gpuDynInst, addr);
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane]
+ = data[lane];
+ }
+ }
+
+ if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
+ gpuDynInst->computeUnit()->globalMemoryPipe.
+ issueRequest(gpuDynInst);
+ wf->wrGmReqsInPipe--;
+ wf->outstandingReqsWrGm++;
+ wf->rdGmReqsInPipe--;
+ wf->outstandingReqsRdGm++;
+ } else {
+ fatal("Non global flat instructions not implemented yet.\n");
+ }
+
+ gpuDynInst->wavefront()->outstandingReqs++;
+ gpuDynInst->wavefront()->validateRequestCounters();
}
+ void
+ Inst_FLAT__FLAT_ATOMIC_INC::initiateAcc(GPUDynInstPtr gpuDynInst)
+ {
+ initAtomicAccess<VecElemU32>(gpuDynInst);
+ } // initiateAcc
+
+ void
+ Inst_FLAT__FLAT_ATOMIC_INC::completeAcc(GPUDynInstPtr gpuDynInst)
+ {
+ if (isAtomicRet()) {
+ VecOperandU32 vdst(gpuDynInst, extData.VDST);
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ vdst[lane] = (reinterpret_cast<VecElemU32*>(
+ gpuDynInst->d_data))[lane];
+ }
+ }
+
+ vdst.write();
+ }
+ } // completeAcc
+
Inst_FLAT__FLAT_ATOMIC_DEC::Inst_FLAT__FLAT_ATOMIC_DEC(InFmt_FLAT
*iFmt)
: Inst_FLAT(iFmt, "flat_atomic_dec")
{
@@ -40868,9 +40997,74 @@
void
Inst_FLAT__FLAT_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst)
{
- panicUnimplemented();
+ Wavefront *wf = gpuDynInst->wavefront();
+
+ if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
+ wf->wrGmReqsInPipe--;
+ wf->rdGmReqsInPipe--;
+ return;
+ }
+
+ gpuDynInst->execUnitId = wf->execUnitId;
+ gpuDynInst->exec_mask = wf->execMask();
+ gpuDynInst->latency.init(gpuDynInst->computeUnit());
+ gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+ ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+ ConstVecOperandU32 data(gpuDynInst, extData.DATA);
+
+ addr.read();
+ data.read();
+
+ calcAddr(gpuDynInst, addr);
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane]
+ = data[lane];
+ }
+ }
+
+ if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
+ gpuDynInst->computeUnit()->globalMemoryPipe.
+ issueRequest(gpuDynInst);
+ wf->wrGmReqsInPipe--;
+ wf->outstandingReqsWrGm++;
+ wf->rdGmReqsInPipe--;
+ wf->outstandingReqsRdGm++;
+ } else {
+ fatal("Non global flat instructions not implemented yet.\n");
+ }
+
+ gpuDynInst->wavefront()->outstandingReqs++;
+ gpuDynInst->wavefront()->validateRequestCounters();
}
+ void
+ Inst_FLAT__FLAT_ATOMIC_DEC::initiateAcc(GPUDynInstPtr gpuDynInst)
+ {
+ initAtomicAccess<VecElemU32>(gpuDynInst);
+ } // initiateAcc
+
+ void
+ Inst_FLAT__FLAT_ATOMIC_DEC::completeAcc(GPUDynInstPtr gpuDynInst)
+ {
+ if (isAtomicRet()) {
+ VecOperandU32 vdst(gpuDynInst, extData.VDST);
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ vdst[lane] = (reinterpret_cast<VecElemU32*>(
+ gpuDynInst->d_data))[lane];
+ }
+ }
+
+ vdst.write();
+ }
+ } // completeAcc
+
Inst_FLAT__FLAT_ATOMIC_SWAP_X2::Inst_FLAT__FLAT_ATOMIC_SWAP_X2(
InFmt_FLAT *iFmt)
: Inst_FLAT(iFmt, "flat_atomic_swap_x2")
@@ -41118,9 +41312,75 @@
void
Inst_FLAT__FLAT_ATOMIC_SUB_X2::execute(GPUDynInstPtr gpuDynInst)
{
- panicUnimplemented();
+ Wavefront *wf = gpuDynInst->wavefront();
+
+ if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
+ wf->wrGmReqsInPipe--;
+ wf->rdGmReqsInPipe--;
+ return;
+ }
+
+ gpuDynInst->execUnitId = wf->execUnitId;
+ gpuDynInst->exec_mask = wf->execMask();
+ gpuDynInst->latency.init(gpuDynInst->computeUnit());
+ gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+ ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+ ConstVecOperandU64 data(gpuDynInst, extData.DATA);
+
+ addr.read();
+ data.read();
+
+ calcAddr(gpuDynInst, addr);
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ (reinterpret_cast<VecElemU64*>(gpuDynInst->a_data))[lane]
+ = data[lane];
+ }
+ }
+
+ if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
+ gpuDynInst->computeUnit()->globalMemoryPipe.
+ issueRequest(gpuDynInst);
+ wf->wrGmReqsInPipe--;
+ wf->outstandingReqsWrGm++;
+ wf->rdGmReqsInPipe--;
+ wf->outstandingReqsRdGm++;
+ } else {
+ fatal("Non global flat instructions not implemented yet.\n");
+ }
+
+ gpuDynInst->wavefront()->outstandingReqs++;
+ gpuDynInst->wavefront()->validateRequestCounters();
}
+ void
+ Inst_FLAT__FLAT_ATOMIC_SUB_X2::initiateAcc(GPUDynInstPtr gpuDynInst)
+ {
+ initAtomicAccess<VecElemU64>(gpuDynInst);
+ } // initiateAcc
+
+ void
+ Inst_FLAT__FLAT_ATOMIC_SUB_X2::completeAcc(GPUDynInstPtr gpuDynInst)
+ {
+ if (isAtomicRet()) {
+ VecOperandU64 vdst(gpuDynInst, extData.VDST);
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ vdst[lane] = (reinterpret_cast<VecElemU64*>(
+ gpuDynInst->d_data))[lane];
+ }
+ }
+
+ vdst.write();
+ }
+ } // completeAcc
+
Inst_FLAT__FLAT_ATOMIC_SMIN_X2::Inst_FLAT__FLAT_ATOMIC_SMIN_X2(
InFmt_FLAT *iFmt)
: Inst_FLAT(iFmt, "flat_atomic_smin_x2")
@@ -41326,9 +41586,75 @@
void
Inst_FLAT__FLAT_ATOMIC_INC_X2::execute(GPUDynInstPtr gpuDynInst)
{
- panicUnimplemented();
+ Wavefront *wf = gpuDynInst->wavefront();
+
+ if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
+ wf->wrGmReqsInPipe--;
+ wf->rdGmReqsInPipe--;
+ return;
+ }
+
+ gpuDynInst->execUnitId = wf->execUnitId;
+ gpuDynInst->exec_mask = wf->execMask();
+ gpuDynInst->latency.init(gpuDynInst->computeUnit());
+ gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+ ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+ ConstVecOperandU64 data(gpuDynInst, extData.DATA);
+
+ addr.read();
+ data.read();
+
+ calcAddr(gpuDynInst, addr);
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ (reinterpret_cast<VecElemU64*>(gpuDynInst->a_data))[lane]
+ = data[lane];
+ }
+ }
+
+ if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
+ gpuDynInst->computeUnit()->globalMemoryPipe.
+ issueRequest(gpuDynInst);
+ wf->wrGmReqsInPipe--;
+ wf->outstandingReqsWrGm++;
+ wf->rdGmReqsInPipe--;
+ wf->outstandingReqsRdGm++;
+ } else {
+ fatal("Non global flat instructions not implemented yet.\n");
+ }
+
+ gpuDynInst->wavefront()->outstandingReqs++;
+ gpuDynInst->wavefront()->validateRequestCounters();
}
+ void
+ Inst_FLAT__FLAT_ATOMIC_INC_X2::initiateAcc(GPUDynInstPtr gpuDynInst)
+ {
+ initAtomicAccess<VecElemU64>(gpuDynInst);
+ } // initiateAcc
+
+ void
+ Inst_FLAT__FLAT_ATOMIC_INC_X2::completeAcc(GPUDynInstPtr gpuDynInst)
+ {
+ if (isAtomicRet()) {
+ VecOperandU64 vdst(gpuDynInst, extData.VDST);
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ vdst[lane] = (reinterpret_cast<VecElemU64*>(
+ gpuDynInst->d_data))[lane];
+ }
+ }
+
+ vdst.write();
+ }
+ } // completeAcc
+
Inst_FLAT__FLAT_ATOMIC_DEC_X2::Inst_FLAT__FLAT_ATOMIC_DEC_X2(
InFmt_FLAT *iFmt)
: Inst_FLAT(iFmt, "flat_atomic_dec_x2")
@@ -41353,6 +41679,72 @@
void
Inst_FLAT__FLAT_ATOMIC_DEC_X2::execute(GPUDynInstPtr gpuDynInst)
{
- panicUnimplemented();
+ Wavefront *wf = gpuDynInst->wavefront();
+
+ if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
+ wf->wrGmReqsInPipe--;
+ wf->rdGmReqsInPipe--;
+ return;
+ }
+
+ gpuDynInst->execUnitId = wf->execUnitId;
+ gpuDynInst->exec_mask = wf->execMask();
+ gpuDynInst->latency.init(gpuDynInst->computeUnit());
+ gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+ ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+ ConstVecOperandU64 data(gpuDynInst, extData.DATA);
+
+ addr.read();
+ data.read();
+
+ calcAddr(gpuDynInst, addr);
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ (reinterpret_cast<VecElemU64*>(gpuDynInst->a_data))[lane]
+ = data[lane];
+ }
+ }
+
+ if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
+ gpuDynInst->computeUnit()->globalMemoryPipe.
+ issueRequest(gpuDynInst);
+ wf->wrGmReqsInPipe--;
+ wf->outstandingReqsWrGm++;
+ wf->rdGmReqsInPipe--;
+ wf->outstandingReqsRdGm++;
+ } else {
+ fatal("Non global flat instructions not implemented yet.\n");
+ }
+
+ gpuDynInst->wavefront()->outstandingReqs++;
+ gpuDynInst->wavefront()->validateRequestCounters();
}
+
+ void
+ Inst_FLAT__FLAT_ATOMIC_DEC_X2::initiateAcc(GPUDynInstPtr gpuDynInst)
+ {
+ initAtomicAccess<VecElemU64>(gpuDynInst);
+ } // initiateAcc
+
+ void
+ Inst_FLAT__FLAT_ATOMIC_DEC_X2::completeAcc(GPUDynInstPtr gpuDynInst)
+ {
+ if (isAtomicRet()) {
+ VecOperandU64 vdst(gpuDynInst, extData.VDST);
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ vdst[lane] = (reinterpret_cast<VecElemU64*>(
+ gpuDynInst->d_data))[lane];
+ }
+ }
+
+ vdst.write();
+ }
+ } // completeAcc
} // namespace Gcn3ISA
diff --git a/src/arch/gcn3/insts/instructions.hh b/src/arch/gcn3/insts/instructions.hh
index f561043..471c130 100644
--- a/src/arch/gcn3/insts/instructions.hh
+++ b/src/arch/gcn3/insts/instructions.hh
@@ -80189,6 +80189,8 @@
} // isDstOperand
void execute(GPUDynInstPtr) override;
+ void initiateAcc(GPUDynInstPtr) override;
+ void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_SUB
class Inst_FLAT__FLAT_ATOMIC_SMIN : public Inst_FLAT
@@ -80717,6 +80719,8 @@
} // isDstOperand
void execute(GPUDynInstPtr) override;
+ void initiateAcc(GPUDynInstPtr) override;
+ void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_INC
class Inst_FLAT__FLAT_ATOMIC_DEC : public Inst_FLAT
@@ -80783,6 +80787,8 @@
} // isDstOperand
void execute(GPUDynInstPtr) override;
+ void initiateAcc(GPUDynInstPtr) override;
+ void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_DEC
class Inst_FLAT__FLAT_ATOMIC_SWAP_X2 : public Inst_FLAT
@@ -81051,6 +81057,8 @@
} // isDstOperand
void execute(GPUDynInstPtr) override;
+ void initiateAcc(GPUDynInstPtr) override;
+ void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_SUB_X2
class Inst_FLAT__FLAT_ATOMIC_SMIN_X2 : public Inst_FLAT
@@ -81579,6 +81587,8 @@
} // isDstOperand
void execute(GPUDynInstPtr) override;
+ void initiateAcc(GPUDynInstPtr) override;
+ void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_INC_X2
class Inst_FLAT__FLAT_ATOMIC_DEC_X2 : public Inst_FLAT
@@ -81645,6 +81655,8 @@
} // isDstOperand
void execute(GPUDynInstPtr) override;
+ void initiateAcc(GPUDynInstPtr) override;
+ void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_DEC_X2
} // namespace Gcn3ISA
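For anyone wanting to exercise the new paths, a minimal HIP kernel of
the following shape should do it, assuming the compiler lowers atomic
builtins on generic pointers to flat atomics (the kernel and names are
illustrative, not part of this change):

    #include <hip/hip_runtime.h>

    __global__ void
    flatAtomics(unsigned int *p, unsigned long long *q, unsigned int n)
    {
        atomicAdd(p, 1u);    // flat_atomic_add
        atomicSub(p, 1u);    // flat_atomic_sub
        atomicInc(p, n);     // flat_atomic_inc (wraps to 0 past n)
        atomicDec(p, n);     // flat_atomic_dec (wraps to n at 0)
        atomicAdd(q, 1ull);  // flat_atomic_add_x2 (64-bit)
    }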
--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/31974
To unsubscribe, or for help writing mail filters, visit
https://gem5-review.googlesource.com/settings
Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I37a67fcacca91a09a82be6597facaa366105d2dc
Gerrit-Change-Number: 31974
Gerrit-PatchSet: 2
Gerrit-Owner: Matt Sinclair <mattdsincl...@gmail.com>
Gerrit-Reviewer: Alexandru Duțu <alexandru.d...@amd.com>
Gerrit-Reviewer: Anthony Gutierrez <anthony.gutier...@amd.com>
Gerrit-Reviewer: Bradford Beckmann <brad.beckm...@amd.com>
Gerrit-Reviewer: Jason Lowe-Power <power...@gmail.com>
Gerrit-Reviewer: Matt Sinclair <mattdsincl...@gmail.com>
Gerrit-Reviewer: kokoro <noreply+kok...@google.com>
Gerrit-CC: GAURAV JAIN <gja...@wisc.edu>
Gerrit-CC: Kyle Roarty <kyleroarty1...@gmail.com>
Gerrit-CC: Matthew Poremba <matthew.pore...@amd.com>
Gerrit-CC: Pouya Fotouhi <pfoto...@ucdavis.edu>
Gerrit-MessageType: merged