changeset 53278be85b40 in /z/repo/gem5
details: http://repo.gem5.org/gem5?cmd=changeset;node=53278be85b40
description:
        arm: Fix v8 neon latency issue for loads/stores

        Neon memory ops that operate on multiple registers currently have
        very poor performance because of interleave/deinterleave micro-ops.

        This patch marks the deinterleave/interleave micro-ops as "No_OpClass"
        such that they take the minimum number of cycles to execute and are
        never resource constrained.

        Additionally, the micro-ops over-read registers.  Although one form
        may need to read up to 20 sources, not all do.  This adds new forms
        so that false dependencies are not modeled: each instruction now
        reads only the minimum number of sources it needs.

diffstat:

 src/arch/arm/insts/macromem.cc        |  47 +++++++++++++++++++++++++++++-----
 src/arch/arm/isa/insts/neon64_mem.isa |  24 +++++++++++-----
 2 files changed, 56 insertions(+), 15 deletions(-)

diffs (140 lines):

diff -r 8bee5f4edb92 -r 53278be85b40 src/arch/arm/insts/macromem.cc
--- a/src/arch/arm/insts/macromem.cc    Tue Apr 29 16:05:02 2014 -0500
+++ b/src/arch/arm/insts/macromem.cc    Wed Sep 03 07:42:44 2014 -0400
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2013 ARM Limited
+ * Copyright (c) 2010-2014 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -1107,9 +1107,26 @@
     }
 
     for (int i = 0; i < numMarshalMicroops; ++i) {
-        microOps[uopIdx++] = new MicroDeintNeon64(
-            machInst, vd + (RegIndex) (2 * i), vx, eSize, dataSize,
-            numStructElems, numRegs, i /* step */);
+        switch(numRegs) {
+            case 1: microOps[uopIdx++] = new MicroDeintNeon64_1Reg(
+                        machInst, vd + (RegIndex) (2 * i), vx, eSize, dataSize,
+                        numStructElems, 1, i /* step */);
+                    break;
+            case 2: microOps[uopIdx++] = new MicroDeintNeon64_2Reg(
+                        machInst, vd + (RegIndex) (2 * i), vx, eSize, dataSize,
+                        numStructElems, 2, i /* step */);
+                    break;
+            case 3: microOps[uopIdx++] = new MicroDeintNeon64_3Reg(
+                        machInst, vd + (RegIndex) (2 * i), vx, eSize, dataSize,
+                        numStructElems, 3, i /* step */);
+                    break;
+            case 4: microOps[uopIdx++] = new MicroDeintNeon64_4Reg(
+                        machInst, vd + (RegIndex) (2 * i), vx, eSize, dataSize,
+                        numStructElems, 4, i /* step */);
+                    break;
+            default: panic("Invalid number of registers");
+        }
+
     }
 
     assert(uopIdx == numMicroops);
@@ -1150,9 +1167,25 @@
     unsigned uopIdx = 0;
 
     for(int i = 0; i < numMarshalMicroops; ++i) {
-        microOps[uopIdx++] = new MicroIntNeon64(
-            machInst, vx + (RegIndex) (2 * i), vd, eSize, dataSize,
-            numStructElems, numRegs, i /* step */);
+        switch (numRegs) {
+            case 1: microOps[uopIdx++] = new MicroIntNeon64_1Reg(
+                        machInst, vx + (RegIndex) (2 * i), vd, eSize, dataSize,
+                        numStructElems, 1, i /* step */);
+                    break;
+            case 2: microOps[uopIdx++] = new MicroIntNeon64_2Reg(
+                        machInst, vx + (RegIndex) (2 * i), vd, eSize, dataSize,
+                        numStructElems, 2, i /* step */);
+                    break;
+            case 3: microOps[uopIdx++] = new MicroIntNeon64_3Reg(
+                        machInst, vx + (RegIndex) (2 * i), vd, eSize, dataSize,
+                        numStructElems, 3, i /* step */);
+                    break;
+            case 4: microOps[uopIdx++] = new MicroIntNeon64_4Reg(
+                        machInst, vx + (RegIndex) (2 * i), vd, eSize, dataSize,
+                        numStructElems, 4, i /* step */);
+                    break;
+            default: panic("Invalid number of registers");
+        }
     }
 
     uint32_t memaccessFlags = TLB::MustBeOne | (TLB::ArmFlags) eSize |
diff -r 8bee5f4edb92 -r 53278be85b40 src/arch/arm/isa/insts/neon64_mem.isa
--- a/src/arch/arm/isa/insts/neon64_mem.isa     Tue Apr 29 16:05:02 2014 -0500
+++ b/src/arch/arm/isa/insts/neon64_mem.isa     Wed Sep 03 07:42:44 2014 -0400
@@ -1,6 +1,6 @@
 // -*- mode: c++ -*-
 
-// Copyright (c) 2012-2013 ARM Limited
+// Copyright (c) 2012-2014 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -163,11 +163,11 @@
         header_output += MicroNeonMemDeclare64.subst(loadIop) + \
             MicroNeonMemDeclare64.subst(storeIop)
 
-    def mkMarshalMicroOp(name, Name):
+    def mkMarshalMicroOp(name, Name, numRegs=4):
         global header_output, decoder_output, exec_output
 
         getInputCodeOp1L = ''
-        for v in range(4):
+        for v in range(numRegs):
             for p in range(4):
                 getInputCodeOp1L += '''
             writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)d_uw,
@@ -175,7 +175,7 @@
             ''' % { 'v' : v, 'p' : p }
 
         getInputCodeOp1S = ''
-        for v in range(4):
+        for v in range(numRegs):
             for p in range(4):
                 getInputCodeOp1S += '''
             writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)dS_uw,
@@ -262,7 +262,8 @@
             '''
 
             iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
-                                { 'code' : eCode }, ['IsMicroop'])
+                                { 'code' : eCode, 'op_class' : 'No_OpClass' },
+                                ['IsMicroop'])
             header_output += MicroNeonMixDeclare64.subst(iop)
             exec_output += MicroNeonMixExecute64.subst(iop)
 
@@ -323,7 +324,8 @@
                 ''' % { 'v': v, 'p': p}
 
             iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
-                                { 'code' : eCode }, ['IsMicroop'])
+                                { 'code' : eCode, 'op_class' : 'No_OpClass' },
+                                ['IsMicroop'])
             header_output += MicroNeonMixDeclare64.subst(iop)
             exec_output += MicroNeonMixExecute64.subst(iop)
 
@@ -443,8 +445,14 @@
 
     # Generate instructions
     mkMemAccMicroOp('mem_neon_uop')
-    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64')
-    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64')
+    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_1Reg', numRegs=1)
+    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_2Reg', numRegs=2)
+    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_3Reg', numRegs=3)
+    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_4Reg', numRegs=4)
+    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_1Reg', numRegs=1)
+    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_2Reg', numRegs=2)
+    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_3Reg', numRegs=3)
+    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_4Reg', numRegs=4)
     mkMarshalMicroOp('unpack_neon_uop', 'MicroUnpackNeon64')
     mkMarshalMicroOp('pack_neon_uop', 'MicroPackNeon64')
 
_______________________________________________
gem5-dev mailing list
gem5-dev@gem5.org
http://m5sim.org/mailman/listinfo/gem5-dev

Reply via email to