[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Read registers in execute instead of initiateAcc
Matt Sinclair has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/45345 ) Change subject: arch-gcn3: Read registers in execute instead of initiateAcc .. arch-gcn3: Read registers in execute instead of initiateAcc Certain memory writes were reading their registers in initiateAcc, which lead to scenarios where a subsequent instruction would execute, clobbering the value in that register before the memory writes' initiateAcc method was called, causing the memory write to read wrong data. This patch moves all register reads to execute, preventing the above scenario from happening. Change-Id: Iee107c19e4b82c2e172bf2d6cc95b79983a43d83 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/45345 Tested-by: kokoro Reviewed-by: Matt Sinclair Reviewed-by: Matthew Poremba Reviewed-by: Alex Dutu Maintainer: Matt Sinclair --- M src/arch/amdgpu/gcn3/insts/instructions.cc 1 file changed, 116 insertions(+), 125 deletions(-) Approvals: Alex Dutu: Looks good to me, approved Matthew Poremba: Looks good to me, approved Matt Sinclair: Looks good to me, but someone else must approve; Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/amdgpu/gcn3/insts/instructions.cc b/src/arch/amdgpu/gcn3/insts/instructions.cc index b5a4300..8c77b8c 100644 --- a/src/arch/amdgpu/gcn3/insts/instructions.cc +++ b/src/arch/amdgpu/gcn3/insts/instructions.cc @@ -5068,8 +5068,13 @@ gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); ScalarRegU32 offset(0); ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); +ConstScalarOperandU32 sdata(gpuDynInst, instData.SDATA); addr.read(); +sdata.read(); + +std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), +sizeof(ScalarRegU32)); if (instData.IMM) { offset = extData.OFFSET; @@ -5093,10 +5098,6 @@ void Inst_SMEM__S_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) { -ConstScalarOperandU32 sdata(gpuDynInst, instData.SDATA); -sdata.read(); -std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), -sizeof(ScalarRegU32)); initMemWrite<1>(gpuDynInst); } // initiateAcc @@ -5127,8 +5128,13 @@ gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); ScalarRegU32 offset(0); ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); +ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA); addr.read(); +sdata.read(); + +std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), +sizeof(ScalarRegU64)); if (instData.IMM) { offset = extData.OFFSET; @@ -5152,10 +5158,6 @@ void Inst_SMEM__S_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) { -ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA); -sdata.read(); -std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), -sizeof(ScalarRegU64)); initMemWrite<2>(gpuDynInst); } // initiateAcc @@ -5186,8 +5188,13 @@ gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); ScalarRegU32 offset(0); ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); +ConstScalarOperandU128 sdata(gpuDynInst, instData.SDATA); addr.read(); +sdata.read(); + +std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), +4 * sizeof(ScalarRegU32)); if (instData.IMM) { offset = extData.OFFSET; @@ -5211,10 +5218,6 @@ void Inst_SMEM__S_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) { -ConstScalarOperandU128 sdata(gpuDynInst, instData.SDATA); -sdata.read(); -std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), -4 * sizeof(ScalarRegU32)); initMemWrite<4>(gpuDynInst); } // initiateAcc @@ -35746,9 +35749,18 @@ ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); +ConstVecOperandI8 data(gpuDynInst, extData.VDATA); rsrcDesc.read(); offset.read(); +data.read(); + +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (gpuDynInst->exec_mask[lane]) { +(reinterpret_cast(gpuDynInst->d_data))[lane] += data[lane]; +} +} int inst_offset = instData.OFFSET; @@ -35793,16 +35805,6 @@ void Inst_MUBUF__BUFFER_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst) { -ConstVecOperandI8 data(gpuDynInst, extData.VDATA); -data.read(); - -for (int lane = 0; lane < NumVecElemPerVecR
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Read registers in execute instead of initiateAcc
Kyle Roarty has uploaded this change for review. ( https://gem5-review.googlesource.com/c/public/gem5/+/45345 ) Change subject: arch-gcn3: Read registers in execute instead of initiateAcc .. arch-gcn3: Read registers in execute instead of initiateAcc Certain memory writes were reading their registers in initiateAcc, which lead to scenarios where a subsequent instruction would execute, clobbering the value in that register before the memory writes' initiateAcc method was called, causing the memory write to read wrong data. This patch moves all register reads to execute, preventing the above scenario from happening. Change-Id: Iee107c19e4b82c2e172bf2d6cc95b79983a43d83 --- M src/arch/amdgpu/gcn3/insts/instructions.cc 1 file changed, 116 insertions(+), 125 deletions(-) diff --git a/src/arch/amdgpu/gcn3/insts/instructions.cc b/src/arch/amdgpu/gcn3/insts/instructions.cc index 8cadff7..4ae4c29 100644 --- a/src/arch/amdgpu/gcn3/insts/instructions.cc +++ b/src/arch/amdgpu/gcn3/insts/instructions.cc @@ -5065,8 +5065,13 @@ gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); ScalarRegU32 offset(0); ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); +ConstScalarOperandU32 sdata(gpuDynInst, instData.SDATA); addr.read(); +sdata.read(); + +std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), +sizeof(ScalarRegU32)); if (instData.IMM) { offset = extData.OFFSET; @@ -5090,10 +5095,6 @@ void Inst_SMEM__S_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) { -ConstScalarOperandU32 sdata(gpuDynInst, instData.SDATA); -sdata.read(); -std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), -sizeof(ScalarRegU32)); initMemWrite<1>(gpuDynInst); } // initiateAcc @@ -5124,8 +5125,13 @@ gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); ScalarRegU32 offset(0); ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); +ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA); addr.read(); +sdata.read(); + +std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), +sizeof(ScalarRegU64)); if (instData.IMM) { offset = extData.OFFSET; @@ -5149,10 +5155,6 @@ void Inst_SMEM__S_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) { -ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA); -sdata.read(); -std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), -sizeof(ScalarRegU64)); initMemWrite<2>(gpuDynInst); } // initiateAcc @@ -5183,8 +5185,13 @@ gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); ScalarRegU32 offset(0); ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); +ConstScalarOperandU128 sdata(gpuDynInst, instData.SDATA); addr.read(); +sdata.read(); + +std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), +4 * sizeof(ScalarRegU32)); if (instData.IMM) { offset = extData.OFFSET; @@ -5208,10 +5215,6 @@ void Inst_SMEM__S_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) { -ConstScalarOperandU128 sdata(gpuDynInst, instData.SDATA); -sdata.read(); -std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), -4 * sizeof(ScalarRegU32)); initMemWrite<4>(gpuDynInst); } // initiateAcc @@ -35743,9 +35746,18 @@ ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); +ConstVecOperandI8 data(gpuDynInst, extData.VDATA); rsrcDesc.read(); offset.read(); +data.read(); + +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (gpuDynInst->exec_mask[lane]) { +(reinterpret_cast(gpuDynInst->d_data))[lane] += data[lane]; +} +} int inst_offset = instData.OFFSET; @@ -35790,16 +35802,6 @@ void Inst_MUBUF__BUFFER_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst) { -ConstVecOperandI8 data(gpuDynInst, extData.VDATA); -data.read(); - -for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { -if (gpuDynInst->exec_mask[lane]) { -(reinterpret_cast(gpuDynInst->d_data))[lane] -= data[lane]; -} -} - initMemWrite(gpuDynInst); } // initiateAcc @@ -35839,9 +35841,18 @@ ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);