Matthew Poremba has submitted this change. (
https://gem5-review.googlesource.com/c/public/gem5/+/55344 )
Change subject: arch-vega: Implement large ds_read/write instructions
......................................................................
arch-vega: Implement large ds_read/write instructions
Port large DS read/write instructions from
https://gem5-review.googlesource.com/c/public/gem5/+/48342.
This implements the 96 and 128b ds_read/write instructions in a similar
fashion to the 3 and 4 dword flat_load/store instructions.
These instructions are treated as reads/writes of 3 or 4 dwords, instead
of as a single 96b/128b memory transaction, due to the limitations of
the VecOperand class used in the amdgpu code.
In order to handle treating the memory transaction as multiple dwords,
the patch also adds in new initMemRead/initMemWrite functions for ds
instructions. These are similar to the functions used in flat
instructions for the same purpose.
Change-Id: Iee2de14eb7f32b6654799d53dc97d806288af98f
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/55344
Reviewed-by: Matt Sinclair <mattdsincl...@gmail.com>
Maintainer: Matt Sinclair <mattdsincl...@gmail.com>
Tested-by: kokoro <noreply+kok...@google.com>
---
M src/arch/amdgpu/vega/insts/op_encodings.hh
M src/arch/amdgpu/vega/insts/instructions.cc
M src/arch/amdgpu/vega/insts/instructions.hh
3 files changed, 260 insertions(+), 4 deletions(-)
Approvals:
Matt Sinclair: Looks good to me, approved; Looks good to me, approved
kokoro: Regressions pass
diff --git a/src/arch/amdgpu/vega/insts/instructions.cc b/src/arch/amdgpu/vega/insts/instructions.cc
index 56ccc3d..7d55624 100644
--- a/src/arch/amdgpu/vega/insts/instructions.cc
+++ b/src/arch/amdgpu/vega/insts/instructions.cc
@@ -37766,8 +37766,51 @@
void
Inst_DS__DS_WRITE_B96::execute(GPUDynInstPtr gpuDynInst)
{
- panicUnimplemented();
+     // Stage a 3-dword (96b) LDS write: read the address VGPR and the three
+     // data VGPRs DATA0..DATA0+2, run calcAddr() on the address operand,
+     // pack the data into d_data, then issue the request to the local
+     // memory pipe.
+     Wavefront *wf = gpuDynInst->wavefront();
+     gpuDynInst->execUnitId = wf->execUnitId;
+     gpuDynInst->latency.init(gpuDynInst->computeUnit());
+     gpuDynInst->latency.set(
+         gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
+     ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
+     ConstVecOperandU32 data0(gpuDynInst, extData.DATA0);
+     ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1);
+     ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2);
+
+     addr.read();
+     data0.read();
+     data1.read();
+     data2.read();
+
+     calcAddr(gpuDynInst, addr);
+
+     // Pack three dwords per active lane back to back. The per-lane stride
+     // must be 3 (not 4): initiateAcc() drains this buffer through
+     // initMemWrite<3>(), which indexes d_data as [lane * 3 + i]. A stride
+     // of 4 would make every lane after lane 0 store the wrong dwords.
+     VecElemU32 *wrData = reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);
+     for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+         if (gpuDynInst->exec_mask[lane]) {
+             wrData[lane * 3] = data0[lane];
+             wrData[lane * 3 + 1] = data1[lane];
+             wrData[lane * 3 + 2] = data2[lane];
+         }
+     }
+
+     gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
} // execute
+
+ void
+ Inst_DS__DS_WRITE_B96::initiateAcc(GPUDynInstPtr gpuDynInst)
+ {
+     // Build the instruction offset as (OFFSET1 << 8) | OFFSET0 and perform
+     // the 3-dword write through initMemWrite<3>().
+     const Addr lowPart = instData.OFFSET0;
+     const Addr highPart = instData.OFFSET1;
+
+     initMemWrite<3>(gpuDynInst, (highPart << 8) | lowPart);
+ } // initiateAcc
+
+ void
+ Inst_DS__DS_WRITE_B96::completeAcc(GPUDynInstPtr gpuDynInst)
+ {
+ // Intentionally empty: this override performs no action when the access
+ // completes (presumably because a write has no destination registers to
+ // update -- confirm against the other DS write instructions).
+ } // completeAcc
// --- Inst_DS__DS_WRITE_B128 class methods ---
Inst_DS__DS_WRITE_B128::Inst_DS__DS_WRITE_B128(InFmt_DS *iFmt)
@@ -37787,8 +37830,55 @@
void
Inst_DS__DS_WRITE_B128::execute(GPUDynInstPtr gpuDynInst)
{
- panicUnimplemented();
+     // Stage a 4-dword (128b) LDS write and send it to the local memory pipe.
+     gpuDynInst->execUnitId = gpuDynInst->wavefront()->execUnitId;
+     gpuDynInst->latency.init(gpuDynInst->computeUnit());
+     gpuDynInst->latency.set(
+         gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
+
+     // Source operands: the address VGPR plus the four consecutive data
+     // VGPRs starting at DATA0.
+     ConstVecOperandU32 ldsAddr(gpuDynInst, extData.ADDR);
+     ConstVecOperandU32 dword0(gpuDynInst, extData.DATA0);
+     ConstVecOperandU32 dword1(gpuDynInst, extData.DATA0 + 1);
+     ConstVecOperandU32 dword2(gpuDynInst, extData.DATA0 + 2);
+     ConstVecOperandU32 dword3(gpuDynInst, extData.DATA0 + 3);
+
+     ldsAddr.read();
+     dword0.read();
+     dword1.read();
+     dword2.read();
+     dword3.read();
+
+     calcAddr(gpuDynInst, ldsAddr);
+
+     // Each active lane owns four contiguous dwords at d_data[lane * 4 + i];
+     // slots belonging to inactive lanes are left untouched.
+     VecElemU32 *staged = reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);
+     for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+         if (!gpuDynInst->exec_mask[lane]) {
+             continue;
+         }
+         VecElemU32 *slot = staged + lane * 4;
+         slot[0] = dword0[lane];
+         slot[1] = dword1[lane];
+         slot[2] = dword2[lane];
+         slot[3] = dword3[lane];
+     }
+
+     gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
} // execute
+
+ void
+ Inst_DS__DS_WRITE_B128::initiateAcc(GPUDynInstPtr gpuDynInst)
+ {
+     // Build the instruction offset as (OFFSET1 << 8) | OFFSET0 and perform
+     // the 4-dword write through initMemWrite<4>().
+     const Addr lowPart = instData.OFFSET0;
+     const Addr highPart = instData.OFFSET1;
+
+     initMemWrite<4>(gpuDynInst, (highPart << 8) | lowPart);
+ } // initiateAcc
+
+ void
+ Inst_DS__DS_WRITE_B128::completeAcc(GPUDynInstPtr gpuDynInst)
+ {
+ // Intentionally empty: this override performs no action when the access
+ // completes (presumably because a write has no destination registers to
+ // update -- confirm against the other DS write instructions).
+ } // completeAcc
// --- Inst_DS__DS_READ_B96 class methods ---
Inst_DS__DS_READ_B96::Inst_DS__DS_READ_B96(InFmt_DS *iFmt)
@@ -37807,8 +37897,52 @@
void
Inst_DS__DS_READ_B96::execute(GPUDynInstPtr gpuDynInst)
{
- panicUnimplemented();
+     // Set up a 96b LDS read: record the execution unit and latency, read
+     // the address VGPR, run calcAddr() on it, and issue the request to the
+     // local memory pipe. The data itself is handled in initiateAcc() and
+     // completeAcc().
+     gpuDynInst->execUnitId = gpuDynInst->wavefront()->execUnitId;
+     gpuDynInst->latency.init(gpuDynInst->computeUnit());
+     gpuDynInst->latency.set(
+         gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
+
+     ConstVecOperandU32 ldsAddr(gpuDynInst, extData.ADDR);
+     ldsAddr.read();
+     calcAddr(gpuDynInst, ldsAddr);
+
+     gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
} // execute
+
+ void
+ Inst_DS__DS_READ_B96::initiateAcc(GPUDynInstPtr gpuDynInst)
+ {
+     // Build the instruction offset as (OFFSET1 << 8) | OFFSET0 and perform
+     // the 3-dword read through initMemRead<3>().
+     const Addr lowPart = instData.OFFSET0;
+     const Addr highPart = instData.OFFSET1;
+
+     initMemRead<3>(gpuDynInst, (highPart << 8) | lowPart);
+ }
+
+ void
+ Inst_DS__DS_READ_B96::completeAcc(GPUDynInstPtr gpuDynInst)
+ {
+     // Copy the three dwords loaded for each active lane out of d_data into
+     // the destination VGPRs VDST..VDST+2, then write those registers back.
+     VecOperandU32 vdst0(gpuDynInst, extData.VDST);
+     VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1);
+     VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2);
+
+     // The per-lane stride must be 3 (not 4): initiateAcc() fills this
+     // buffer through initMemRead<3>(), which stores lane data at
+     // d_data[lane * 3 + i]. A stride of 4 would hand every lane after
+     // lane 0 dwords that belong to other lanes or were never loaded.
+     const VecElemU32 *rdData =
+         reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);
+     for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+         if (gpuDynInst->exec_mask[lane]) {
+             vdst0[lane] = rdData[lane * 3];
+             vdst1[lane] = rdData[lane * 3 + 1];
+             vdst2[lane] = rdData[lane * 3 + 2];
+         }
+     }
+
+     vdst0.write();
+     vdst1.write();
+     vdst2.write();
+ }
// --- Inst_DS__DS_READ_B128 class methods ---
Inst_DS__DS_READ_B128::Inst_DS__DS_READ_B128(InFmt_DS *iFmt)
@@ -37827,8 +37961,56 @@
void
Inst_DS__DS_READ_B128::execute(GPUDynInstPtr gpuDynInst)
{
- panicUnimplemented();
+     // Set up a 128b LDS read: record the execution unit and latency, read
+     // the address VGPR, run calcAddr() on it, and issue the request to the
+     // local memory pipe. The data itself is handled in initiateAcc() and
+     // completeAcc().
+     gpuDynInst->execUnitId = gpuDynInst->wavefront()->execUnitId;
+     gpuDynInst->latency.init(gpuDynInst->computeUnit());
+     gpuDynInst->latency.set(
+         gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
+
+     ConstVecOperandU32 ldsAddr(gpuDynInst, extData.ADDR);
+     ldsAddr.read();
+     calcAddr(gpuDynInst, ldsAddr);
+
+     gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
} // execute
+
+ void
+ Inst_DS__DS_READ_B128::initiateAcc(GPUDynInstPtr gpuDynInst)
+ {
+     // Build the instruction offset as (OFFSET1 << 8) | OFFSET0 and perform
+     // the 4-dword read through initMemRead<4>().
+     const Addr lowPart = instData.OFFSET0;
+     const Addr highPart = instData.OFFSET1;
+
+     initMemRead<4>(gpuDynInst, (highPart << 8) | lowPart);
+ } // initiateAcc
+
+ void
+ Inst_DS__DS_READ_B128::completeAcc(GPUDynInstPtr gpuDynInst)
+ {
+     // Copy the four dwords held for each active lane at
+     // d_data[lane * 4 + i] into the destination VGPRs VDST..VDST+3, then
+     // write those registers back.
+     VecOperandU32 dst0(gpuDynInst, extData.VDST);
+     VecOperandU32 dst1(gpuDynInst, extData.VDST + 1);
+     VecOperandU32 dst2(gpuDynInst, extData.VDST + 2);
+     VecOperandU32 dst3(gpuDynInst, extData.VDST + 3);
+
+     const VecElemU32 *loaded =
+         reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);
+     for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+         if (!gpuDynInst->exec_mask[lane]) {
+             continue;
+         }
+         const VecElemU32 *slot = loaded + lane * 4;
+         dst0[lane] = slot[0];
+         dst1[lane] = slot[1];
+         dst2[lane] = slot[2];
+         dst3[lane] = slot[3];
+     }
+
+     dst0.write();
+     dst1.write();
+     dst2.write();
+     dst3.write();
+ } // completeAcc
// --- Inst_MUBUF__BUFFER_LOAD_FORMAT_X class methods ---
Inst_MUBUF__BUFFER_LOAD_FORMAT_X
diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh
index 95c66ba..e14f52f 100644
--- a/src/arch/amdgpu/vega/insts/instructions.hh
+++ b/src/arch/amdgpu/vega/insts/instructions.hh
@@ -35542,6 +35542,8 @@
} // getOperandSize
void execute(GPUDynInstPtr) override;
+ void initiateAcc(GPUDynInstPtr) override;
+ void completeAcc(GPUDynInstPtr) override;
}; // Inst_DS__DS_WRITE_B96
class Inst_DS__DS_WRITE_B128 : public Inst_DS
@@ -35574,6 +35576,8 @@
} // getOperandSize
void execute(GPUDynInstPtr) override;
+ void initiateAcc(GPUDynInstPtr) override;
+ void completeAcc(GPUDynInstPtr) override;
}; // Inst_DS__DS_WRITE_B128
class Inst_DS__DS_READ_B96 : public Inst_DS
@@ -35606,6 +35610,8 @@
} // getOperandSize
void execute(GPUDynInstPtr) override;
+ void initiateAcc(GPUDynInstPtr) override;
+ void completeAcc(GPUDynInstPtr) override;
}; // Inst_DS__DS_READ_B96
class Inst_DS__DS_READ_B128 : public Inst_DS
@@ -35638,6 +35644,8 @@
} // getOperandSize
void execute(GPUDynInstPtr) override;
+ void initiateAcc(GPUDynInstPtr) override;
+ void completeAcc(GPUDynInstPtr) override;
}; // Inst_DS__DS_READ_B128
class Inst_MUBUF__BUFFER_LOAD_FORMAT_X : public Inst_MUBUF
diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh
index 109e680..bb45b74 100644
--- a/src/arch/amdgpu/vega/insts/op_encodings.hh
+++ b/src/arch/amdgpu/vega/insts/op_encodings.hh
@@ -414,6 +414,25 @@
}
}
+ /**
+  * Read N consecutive dwords per active lane from the wavefront's LDS
+  * chunk into the instruction's d_data buffer.
+  *
+  * For lane L, dword i is read from address addr[L] + offset + i * 4
+  * (sizeof(VecElemU32) bytes apart) and stored at d_data[L * N + i].
+  * Lanes whose exec_mask bit is clear are skipped.
+  *
+  * @param gpuDynInst dynamic instruction supplying addr, exec_mask, d_data
+  * @param offset value added to every lane's address
+  */
+ template<int N>
+ void
+ initMemRead(GPUDynInstPtr gpuDynInst, Addr offset)
+ {
+     Wavefront *wf = gpuDynInst->wavefront();
+     VecElemU32 *dest = reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);
+
+     for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+         if (!gpuDynInst->exec_mask[lane]) {
+             continue;
+         }
+         const Addr base = gpuDynInst->addr[lane] + offset;
+         for (int dword = 0; dword < N; ++dword) {
+             dest[lane * N + dword] = wf->ldsChunk->read<VecElemU32>(
+                 base + dword * sizeof(VecElemU32));
+         }
+     }
+ }
+
template<typename T>
void
initDualMemRead(GPUDynInstPtr gpuDynInst, Addr offset0, Addr
offset1)
@@ -448,6 +467,25 @@
}
}
+ /**
+  * Write N consecutive dwords per active lane from the instruction's
+  * d_data buffer into the wavefront's LDS chunk.
+  *
+  * For lane L, dword i is taken from d_data[L * N + i] and written to
+  * address addr[L] + offset + i * 4 (sizeof(VecElemU32) bytes apart).
+  * Lanes whose exec_mask bit is clear are skipped.
+  *
+  * @param gpuDynInst dynamic instruction supplying addr, exec_mask, d_data
+  * @param offset value added to every lane's address
+  */
+ template<int N>
+ void
+ initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset)
+ {
+     Wavefront *wf = gpuDynInst->wavefront();
+     VecElemU32 *src = reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);
+
+     for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+         if (!gpuDynInst->exec_mask[lane]) {
+             continue;
+         }
+         const Addr base = gpuDynInst->addr[lane] + offset;
+         for (int dword = 0; dword < N; ++dword) {
+             wf->ldsChunk->write<VecElemU32>(
+                 base + dword * sizeof(VecElemU32),
+                 src[lane * N + dword]);
+         }
+     }
+ }
+
template<typename T>
void
initDualMemWrite(GPUDynInstPtr gpuDynInst, Addr offset0, Addr
offset1)
--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/55344
To unsubscribe, or for help writing mail filters, visit
https://gem5-review.googlesource.com/settings
Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Iee2de14eb7f32b6654799d53dc97d806288af98f
Gerrit-Change-Number: 55344
Gerrit-PatchSet: 2
Gerrit-Owner: Matthew Poremba <matthew.pore...@amd.com>
Gerrit-Reviewer: Kyle Roarty <kyleroarty1...@gmail.com>
Gerrit-Reviewer: Matt Sinclair <mattdsincl...@gmail.com>
Gerrit-Reviewer: Matthew Poremba <matthew.pore...@amd.com>
Gerrit-Reviewer: kokoro <noreply+kok...@google.com>
Gerrit-MessageType: merged
_______________________________________________
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s