[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Read registers in execute instead of initiateAcc

2021-07-07 Thread Matt Sinclair (Gerrit) via gem5-dev
Matt Sinclair has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/45345 )


Change subject: arch-gcn3: Read registers in execute instead of initiateAcc
..

arch-gcn3: Read registers in execute instead of initiateAcc

Certain memory writes were reading their registers in
initiateAcc, which lead to scenarios where a subsequent instruction
would execute, clobbering the value in that register before the memory
writes' initiateAcc method was called, causing the memory write to read
wrong data.

This patch moves all register reads to execute, preventing the above
scenario from happening.

Change-Id: Iee107c19e4b82c2e172bf2d6cc95b79983a43d83
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/45345
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
Reviewed-by: Matthew Poremba 
Reviewed-by: Alex Dutu 
Maintainer: Matt Sinclair 
---
M src/arch/amdgpu/gcn3/insts/instructions.cc
1 file changed, 116 insertions(+), 125 deletions(-)

Approvals:
  Alex Dutu: Looks good to me, approved
  Matthew Poremba: Looks good to me, approved
  Matt Sinclair: Looks good to me, but someone else must approve; Looks  
good to me, approved

  kokoro: Regressions pass



diff --git a/src/arch/amdgpu/gcn3/insts/instructions.cc  
b/src/arch/amdgpu/gcn3/insts/instructions.cc

index b5a4300..8c77b8c 100644
--- a/src/arch/amdgpu/gcn3/insts/instructions.cc
+++ b/src/arch/amdgpu/gcn3/insts/instructions.cc
@@ -5068,8 +5068,13 @@
 gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
 ScalarRegU32 offset(0);
 ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);
+ConstScalarOperandU32 sdata(gpuDynInst, instData.SDATA);

 addr.read();
+sdata.read();
+
+std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
+sizeof(ScalarRegU32));

 if (instData.IMM) {
 offset = extData.OFFSET;
@@ -5093,10 +5098,6 @@
 void
 Inst_SMEM__S_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
 {
-ConstScalarOperandU32 sdata(gpuDynInst, instData.SDATA);
-sdata.read();
-std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
-sizeof(ScalarRegU32));
 initMemWrite<1>(gpuDynInst);
 } // initiateAcc

@@ -5127,8 +5128,13 @@
 gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
 ScalarRegU32 offset(0);
 ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);
+ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA);

 addr.read();
+sdata.read();
+
+std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
+sizeof(ScalarRegU64));

 if (instData.IMM) {
 offset = extData.OFFSET;
@@ -5152,10 +5158,6 @@
 void
 Inst_SMEM__S_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
 {
-ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA);
-sdata.read();
-std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
-sizeof(ScalarRegU64));
 initMemWrite<2>(gpuDynInst);
 } // initiateAcc

@@ -5186,8 +5188,13 @@
 gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
 ScalarRegU32 offset(0);
 ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);
+ConstScalarOperandU128 sdata(gpuDynInst, instData.SDATA);

 addr.read();
+sdata.read();
+
+std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
+4 * sizeof(ScalarRegU32));

 if (instData.IMM) {
 offset = extData.OFFSET;
@@ -5211,10 +5218,6 @@
 void
 Inst_SMEM__S_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
 {
-ConstScalarOperandU128 sdata(gpuDynInst, instData.SDATA);
-sdata.read();
-std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
-4 * sizeof(ScalarRegU32));
 initMemWrite<4>(gpuDynInst);
 } // initiateAcc

@@ -35746,9 +35749,18 @@
 ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
 ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
 ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
+ConstVecOperandI8 data(gpuDynInst, extData.VDATA);

 rsrcDesc.read();
 offset.read();
+data.read();
+
+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (gpuDynInst->exec_mask[lane]) {
+(reinterpret_cast(gpuDynInst->d_data))[lane]
+= data[lane];
+}
+}

 int inst_offset = instData.OFFSET;

@@ -35793,16 +35805,6 @@
 void
 Inst_MUBUF__BUFFER_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst)
 {
-ConstVecOperandI8 data(gpuDynInst, extData.VDATA);
-data.read();
-
-for (int lane = 0; lane < NumVecElemPerVecR

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Read registers in execute instead of initiateAcc

2021-05-11 Thread Kyle Roarty (Gerrit) via gem5-dev
Kyle Roarty has uploaded this change for review. (  
https://gem5-review.googlesource.com/c/public/gem5/+/45345 )



Change subject: arch-gcn3: Read registers in execute instead of initiateAcc
..

arch-gcn3: Read registers in execute instead of initiateAcc

Certain memory writes were reading their registers in
initiateAcc, which lead to scenarios where a subsequent instruction
would execute, clobbering the value in that register before the memory
writes' initiateAcc method was called, causing the memory write to read
wrong data.

This patch moves all register reads to execute, preventing the above
scenario from happening.

Change-Id: Iee107c19e4b82c2e172bf2d6cc95b79983a43d83
---
M src/arch/amdgpu/gcn3/insts/instructions.cc
1 file changed, 116 insertions(+), 125 deletions(-)



diff --git a/src/arch/amdgpu/gcn3/insts/instructions.cc  
b/src/arch/amdgpu/gcn3/insts/instructions.cc

index 8cadff7..4ae4c29 100644
--- a/src/arch/amdgpu/gcn3/insts/instructions.cc
+++ b/src/arch/amdgpu/gcn3/insts/instructions.cc
@@ -5065,8 +5065,13 @@
 gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
 ScalarRegU32 offset(0);
 ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);
+ConstScalarOperandU32 sdata(gpuDynInst, instData.SDATA);

 addr.read();
+sdata.read();
+
+std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
+sizeof(ScalarRegU32));

 if (instData.IMM) {
 offset = extData.OFFSET;
@@ -5090,10 +5095,6 @@
 void
 Inst_SMEM__S_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
 {
-ConstScalarOperandU32 sdata(gpuDynInst, instData.SDATA);
-sdata.read();
-std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
-sizeof(ScalarRegU32));
 initMemWrite<1>(gpuDynInst);
 } // initiateAcc

@@ -5124,8 +5125,13 @@
 gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
 ScalarRegU32 offset(0);
 ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);
+ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA);

 addr.read();
+sdata.read();
+
+std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
+sizeof(ScalarRegU64));

 if (instData.IMM) {
 offset = extData.OFFSET;
@@ -5149,10 +5155,6 @@
 void
 Inst_SMEM__S_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
 {
-ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA);
-sdata.read();
-std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
-sizeof(ScalarRegU64));
 initMemWrite<2>(gpuDynInst);
 } // initiateAcc

@@ -5183,8 +5185,13 @@
 gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
 ScalarRegU32 offset(0);
 ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);
+ConstScalarOperandU128 sdata(gpuDynInst, instData.SDATA);

 addr.read();
+sdata.read();
+
+std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
+4 * sizeof(ScalarRegU32));

 if (instData.IMM) {
 offset = extData.OFFSET;
@@ -5208,10 +5215,6 @@
 void
 Inst_SMEM__S_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
 {
-ConstScalarOperandU128 sdata(gpuDynInst, instData.SDATA);
-sdata.read();
-std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
-4 * sizeof(ScalarRegU32));
 initMemWrite<4>(gpuDynInst);
 } // initiateAcc

@@ -35743,9 +35746,18 @@
 ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
 ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
 ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
+ConstVecOperandI8 data(gpuDynInst, extData.VDATA);

 rsrcDesc.read();
 offset.read();
+data.read();
+
+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (gpuDynInst->exec_mask[lane]) {
+(reinterpret_cast(gpuDynInst->d_data))[lane]
+= data[lane];
+}
+}

 int inst_offset = instData.OFFSET;

@@ -35790,16 +35802,6 @@
 void
 Inst_MUBUF__BUFFER_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst)
 {
-ConstVecOperandI8 data(gpuDynInst, extData.VDATA);
-data.read();
-
-for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-if (gpuDynInst->exec_mask[lane]) {
-(reinterpret_cast(gpuDynInst->d_data))[lane]
-= data[lane];
-}
-}
-
 initMemWrite(gpuDynInst);
 } // initiateAcc

@@ -35839,9 +35841,18 @@
 ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
 ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);