Matt Sinclair has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/45345 )

Change subject: arch-gcn3: Read registers in execute instead of initiateAcc
......................................................................

arch-gcn3: Read registers in execute instead of initiateAcc

Certain memory writes were reading their registers in
initiateAcc, which lead to scenarios where a subsequent instruction
would execute, clobbering the value in that register before the memory
writes' initiateAcc method was called, causing the memory write to read
wrong data.

This patch moves all register reads to execute, preventing the above
scenario from happening.

Change-Id: Iee107c19e4b82c2e172bf2d6cc95b79983a43d83
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/45345
Tested-by: kokoro <noreply+kok...@google.com>
Reviewed-by: Matt Sinclair <mattdsincl...@gmail.com>
Reviewed-by: Matthew Poremba <matthew.pore...@amd.com>
Reviewed-by: Alex Dutu <alexandru.d...@amd.com>
Maintainer: Matt Sinclair <mattdsincl...@gmail.com>
---
M src/arch/amdgpu/gcn3/insts/instructions.cc
1 file changed, 116 insertions(+), 125 deletions(-)

Approvals:
  Alex Dutu: Looks good to me, approved
  Matthew Poremba: Looks good to me, approved
Matt Sinclair: Looks good to me, but someone else must approve; Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/amdgpu/gcn3/insts/instructions.cc b/src/arch/amdgpu/gcn3/insts/instructions.cc
index b5a4300..8c77b8c 100644
--- a/src/arch/amdgpu/gcn3/insts/instructions.cc
+++ b/src/arch/amdgpu/gcn3/insts/instructions.cc
@@ -5068,8 +5068,13 @@
         gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
         ScalarRegU32 offset(0);
         ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);
+        ConstScalarOperandU32 sdata(gpuDynInst, instData.SDATA);

         addr.read();
+        sdata.read();
+
+        std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
+            sizeof(ScalarRegU32));

         if (instData.IMM) {
             offset = extData.OFFSET;
@@ -5093,10 +5098,6 @@
     void
     Inst_SMEM__S_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
     {
-        ConstScalarOperandU32 sdata(gpuDynInst, instData.SDATA);
-        sdata.read();
-        std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
-            sizeof(ScalarRegU32));
         initMemWrite<1>(gpuDynInst);
     } // initiateAcc

@@ -5127,8 +5128,13 @@
         gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
         ScalarRegU32 offset(0);
         ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);
+        ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA);

         addr.read();
+        sdata.read();
+
+        std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
+            sizeof(ScalarRegU64));

         if (instData.IMM) {
             offset = extData.OFFSET;
@@ -5152,10 +5158,6 @@
     void
     Inst_SMEM__S_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
     {
-        ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA);
-        sdata.read();
-        std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
-            sizeof(ScalarRegU64));
         initMemWrite<2>(gpuDynInst);
     } // initiateAcc

@@ -5186,8 +5188,13 @@
         gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
         ScalarRegU32 offset(0);
         ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);
+        ConstScalarOperandU128 sdata(gpuDynInst, instData.SDATA);

         addr.read();
+        sdata.read();
+
+        std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
+            4 * sizeof(ScalarRegU32));

         if (instData.IMM) {
             offset = extData.OFFSET;
@@ -5211,10 +5218,6 @@
     void
     Inst_SMEM__S_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
     {
-        ConstScalarOperandU128 sdata(gpuDynInst, instData.SDATA);
-        sdata.read();
-        std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
-            4 * sizeof(ScalarRegU32));
         initMemWrite<4>(gpuDynInst);
     } // initiateAcc

@@ -35746,9 +35749,18 @@
         ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
         ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
         ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
+        ConstVecOperandI8 data(gpuDynInst, extData.VDATA);

         rsrcDesc.read();
         offset.read();
+        data.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemI8*>(gpuDynInst->d_data))[lane]
+                    = data[lane];
+            }
+        }

         int inst_offset = instData.OFFSET;

@@ -35793,16 +35805,6 @@
     void
     Inst_MUBUF__BUFFER_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst)
     {
-        ConstVecOperandI8 data(gpuDynInst, extData.VDATA);
-        data.read();
-
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (gpuDynInst->exec_mask[lane]) {
-                (reinterpret_cast<VecElemI8*>(gpuDynInst->d_data))[lane]
-                    = data[lane];
-            }
-        }
-
         initMemWrite<VecElemI8>(gpuDynInst);
     } // initiateAcc

@@ -35842,9 +35844,18 @@
         ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
         ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
         ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
+        ConstVecOperandI16 data(gpuDynInst, extData.VDATA);

         rsrcDesc.read();
         offset.read();
+        data.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemI16*>(gpuDynInst->d_data))[lane]
+                    = data[lane];
+            }
+        }

         int inst_offset = instData.OFFSET;

@@ -35889,16 +35900,6 @@
     void
     Inst_MUBUF__BUFFER_STORE_SHORT::initiateAcc(GPUDynInstPtr gpuDynInst)
     {
-        ConstVecOperandI16 data(gpuDynInst, extData.VDATA);
-        data.read();
-
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (gpuDynInst->exec_mask[lane]) {
-                (reinterpret_cast<VecElemI16*>(gpuDynInst->d_data))[lane]
-                    = data[lane];
-            }
-        }
-
         initMemWrite<VecElemI16>(gpuDynInst);
     } // initiateAcc

@@ -35938,12 +35939,21 @@
         ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
         ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
         ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
+        ConstVecOperandU32 data(gpuDynInst, extData.VDATA);

         rsrcDesc.read();
         offset.read();
+        data.read();

         int inst_offset = instData.OFFSET;

+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane]
+                    = data[lane];
+            }
+        }
+
         if (!instData.IDXEN && !instData.OFFEN) {
             calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                 ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
@@ -35985,16 +35995,6 @@
     void
     Inst_MUBUF__BUFFER_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
     {
-        ConstVecOperandU32 data(gpuDynInst, extData.VDATA);
-        data.read();
-
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (gpuDynInst->exec_mask[lane]) {
-                (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane]
-                    = data[lane];
-            }
-        }
-
         initMemWrite<VecElemU32>(gpuDynInst);
     } // initiateAcc

@@ -39998,11 +39998,20 @@
         gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

         ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+        ConstVecOperandU8 data(gpuDynInst, extData.DATA);

         addr.read();
+        data.read();

         calcAddr(gpuDynInst, addr);

+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU8*>(gpuDynInst->d_data))[lane]
+                    = data[lane];
+            }
+        }
+
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
@@ -40019,16 +40028,6 @@
     void
     Inst_FLAT__FLAT_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst)
     {
-        ConstVecOperandU8 data(gpuDynInst, extData.DATA);
-        data.read();
-
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (gpuDynInst->exec_mask[lane]) {
-                (reinterpret_cast<VecElemU8*>(gpuDynInst->d_data))[lane]
-                    = data[lane];
-            }
-        }
-
         initMemWrite<VecElemU8>(gpuDynInst);
     } // initiateAcc

@@ -40068,11 +40067,20 @@
         gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

         ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+        ConstVecOperandU16 data(gpuDynInst, extData.DATA);

         addr.read();
+        data.read();

         calcAddr(gpuDynInst, addr);

+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU16*>(gpuDynInst->d_data))[lane]
+                    = data[lane];
+            }
+        }
+
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
@@ -40089,17 +40097,6 @@
     void
     Inst_FLAT__FLAT_STORE_SHORT::initiateAcc(GPUDynInstPtr gpuDynInst)
     {
-        ConstVecOperandU16 data(gpuDynInst, extData.DATA);
-
-        data.read();
-
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (gpuDynInst->exec_mask[lane]) {
-                (reinterpret_cast<VecElemU16*>(gpuDynInst->d_data))[lane]
-                    = data[lane];
-            }
-        }
-
         initMemWrite<VecElemU16>(gpuDynInst);
     } // initiateAcc

@@ -40139,11 +40136,20 @@
         gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

         ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+        ConstVecOperandU32 data(gpuDynInst, extData.DATA);

         addr.read();
+        data.read();

         calcAddr(gpuDynInst, addr);

+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane]
+                    = data[lane];
+            }
+        }
+
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
@@ -40160,16 +40166,6 @@
     void
     Inst_FLAT__FLAT_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
     {
-        ConstVecOperandU32 data(gpuDynInst, extData.DATA);
-        data.read();
-
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (gpuDynInst->exec_mask[lane]) {
-                (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane]
-                    = data[lane];
-            }
-        }
-
         initMemWrite<VecElemU32>(gpuDynInst);
     } // initiateAcc

@@ -40210,11 +40206,20 @@
         gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

         ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+        ConstVecOperandU64 data(gpuDynInst, extData.DATA);

         addr.read();
+        data.read();

         calcAddr(gpuDynInst, addr);

+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU64*>(gpuDynInst->d_data))[lane]
+                    = data[lane];
+            }
+        }
+
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
@@ -40231,16 +40236,6 @@
     void
     Inst_FLAT__FLAT_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
     {
-        ConstVecOperandU64 data(gpuDynInst, extData.DATA);
-        data.read();
-
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (gpuDynInst->exec_mask[lane]) {
-                (reinterpret_cast<VecElemU64*>(gpuDynInst->d_data))[lane]
-                    = data[lane];
-            }
-        }
-
         initMemWrite<VecElemU64>(gpuDynInst);
     } // initiateAcc

@@ -40281,11 +40276,28 @@
         gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

         ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+        ConstVecOperandU32 data0(gpuDynInst, extData.DATA);
+        ConstVecOperandU32 data1(gpuDynInst, extData.DATA + 1);
+        ConstVecOperandU32 data2(gpuDynInst, extData.DATA + 2);

         addr.read();
+        data0.read();
+        data1.read();
+        data2.read();

         calcAddr(gpuDynInst, addr);

+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 3] = data0[lane];
+                (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 3 + 1] = data1[lane];
+                (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 3 + 2] = data2[lane];
+            }
+        }
+
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
@@ -40302,25 +40314,6 @@
     void
     Inst_FLAT__FLAT_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst)
     {
-        ConstVecOperandU32 data0(gpuDynInst, extData.DATA);
-        ConstVecOperandU32 data1(gpuDynInst, extData.DATA + 1);
-        ConstVecOperandU32 data2(gpuDynInst, extData.DATA + 2);
-
-        data0.read();
-        data1.read();
-        data2.read();
-
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (gpuDynInst->exec_mask[lane]) {
-                (reinterpret_cast<VecElemU32*>(
-                    gpuDynInst->d_data))[lane * 3] = data0[lane];
-                (reinterpret_cast<VecElemU32*>(
-                    gpuDynInst->d_data))[lane * 3 + 1] = data1[lane];
-                (reinterpret_cast<VecElemU32*>(
-                    gpuDynInst->d_data))[lane * 3 + 2] = data2[lane];
-            }
-        }
-
         initMemWrite<3>(gpuDynInst);
     } // initiateAcc

@@ -40361,11 +40354,32 @@
         gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

         ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+        ConstVecOperandU32 data0(gpuDynInst, extData.DATA);
+        ConstVecOperandU32 data1(gpuDynInst, extData.DATA + 1);
+        ConstVecOperandU32 data2(gpuDynInst, extData.DATA + 2);
+        ConstVecOperandU32 data3(gpuDynInst, extData.DATA + 3);

         addr.read();
+        data0.read();
+        data1.read();
+        data2.read();
+        data3.read();

         calcAddr(gpuDynInst, addr);

+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4] = data0[lane];
+                (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4 + 1] = data1[lane];
+                (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4 + 2] = data2[lane];
+                (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4 + 3] = data3[lane];
+            }
+        }
+
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
@@ -40382,29 +40396,6 @@
     void
     Inst_FLAT__FLAT_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
     {
-        ConstVecOperandU32 data0(gpuDynInst, extData.DATA);
-        ConstVecOperandU32 data1(gpuDynInst, extData.DATA + 1);
-        ConstVecOperandU32 data2(gpuDynInst, extData.DATA + 2);
-        ConstVecOperandU32 data3(gpuDynInst, extData.DATA + 3);
-
-        data0.read();
-        data1.read();
-        data2.read();
-        data3.read();
-
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (gpuDynInst->exec_mask[lane]) {
-                (reinterpret_cast<VecElemU32*>(
-                    gpuDynInst->d_data))[lane * 4] = data0[lane];
-                (reinterpret_cast<VecElemU32*>(
-                    gpuDynInst->d_data))[lane * 4 + 1] = data1[lane];
-                (reinterpret_cast<VecElemU32*>(
-                    gpuDynInst->d_data))[lane * 4 + 2] = data2[lane];
-                (reinterpret_cast<VecElemU32*>(
-                    gpuDynInst->d_data))[lane * 4 + 3] = data3[lane];
-            }
-        }
-
         initMemWrite<4>(gpuDynInst);
     } // initiateAcc




3 is the latest approved patch-set.
No files were changed between the latest approved patch-set and the submitted one.
--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/45345
To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Iee107c19e4b82c2e172bf2d6cc95b79983a43d83
Gerrit-Change-Number: 45345
Gerrit-PatchSet: 5
Gerrit-Owner: Kyle Roarty <kyleroarty1...@gmail.com>
Gerrit-Reviewer: Alex Dutu <alexandru.d...@amd.com>
Gerrit-Reviewer: Matt Sinclair <mattdsincl...@gmail.com>
Gerrit-Reviewer: Matthew Poremba <matthew.pore...@amd.com>
Gerrit-Reviewer: kokoro <noreply+kok...@google.com>
Gerrit-MessageType: merged
_______________________________________________
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

Reply via email to