changeset 270c9a75e91f in /z/repo/gem5
details: http://repo.gem5.org/gem5?cmd=changeset;node=270c9a75e91f
description:
        X86: Move address based decode caching in front of the predecoder.
        The predecoder in x86 does a lot of work, most of which can be skipped
        if the decoder cache is put in front of it.

        Committed by: Nilay Vaish <[email protected]>

diffstat:

 src/arch/x86/decoder.cc |  157 +++++++++++++++++++++++++++++++++++++----------
 src/arch/x86/decoder.hh |  135 ++++++++++++++++++++++++++++++++---------
 src/arch/x86/isa.cc     |   21 ++++-
 src/arch/x86/isa.hh     |    3 +-
 4 files changed, 245 insertions(+), 71 deletions(-)

diffs (truncated from 587 to 300 lines):

diff -r ecfd5607d5e9 -r 270c9a75e91f src/arch/x86/decoder.cc
--- a/src/arch/x86/decoder.cc   Fri Jan 04 18:09:45 2013 -0600
+++ b/src/arch/x86/decoder.cc   Fri Jan 04 19:00:44 2013 -0600
@@ -38,10 +38,15 @@
 
 namespace X86ISA
 {
-void Decoder::doReset()
+
+Decoder::State
+Decoder::doResetState()
 {
     origPC = basePC + offset;
     DPRINTF(Decoder, "Setting origPC to %#x\n", origPC);
+    instBytes = &decodePages->lookup(origPC);
+    chunkIdx = 0;
+
     emi.rex = 0;
     emi.legacy = 0;
     emi.opcode.num = 0;
@@ -55,12 +60,17 @@
 
     emi.modRM = 0;
     emi.sib = 0;
-    m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
-    emi.mode.mode = m5Reg.mode;
-    emi.mode.submode = m5Reg.submode;
+
+    if (instBytes->si) {
+        return FromCacheState;
+    } else {
+        instBytes->chunks.clear();
+        return PrefixState;
+    }
 }
 
-void Decoder::process()
+void
+Decoder::process()
 {
     //This function drives the decoder state machine.
 
@@ -70,15 +80,18 @@
     assert(!outOfBytes);
     assert(!instDone);
 
+    if (state == ResetState)
+        state = doResetState();
+    if (state == FromCacheState) {
+        state = doFromCacheState();
+    } else {
+        instBytes->chunks.push_back(fetchChunk);
+    }
+
     //While there's still something to do...
-    while(!instDone && !outOfBytes)
-    {
+    while (!instDone && !outOfBytes) {
         uint8_t nextByte = getNextByte();
-        switch(state)
-        {
-          case ResetState:
-            doReset();
-            state = PrefixState;
+        switch (state) {
           case PrefixState:
             state = doPrefixState(nextByte);
             break;
@@ -105,9 +118,42 @@
     }
 }
 
+Decoder::State
+Decoder::doFromCacheState()
+{
+    DPRINTF(Decoder, "Looking at cache state.\n");
+    if ((fetchChunk & instBytes->masks[chunkIdx]) !=
+            instBytes->chunks[chunkIdx]) {
+        DPRINTF(Decoder, "Decode cache miss.\n");
+        // The chached chunks didn't match what was fetched. Fall back to the
+        // predecoder.
+        instBytes->chunks[chunkIdx] = fetchChunk;
+        instBytes->chunks.resize(chunkIdx + 1);
+        instBytes->si = NULL;
+        chunkIdx = 0;
+        fetchChunk = instBytes->chunks[0];
+        offset = origPC % sizeof(MachInst);
+        basePC = origPC - offset;
+        return PrefixState;
+    } else if (chunkIdx == instBytes->chunks.size() - 1) {
+        // We matched the cache, so use its value.
+        instDone = true;
+        offset = instBytes->lastOffset;
+        if (offset == sizeof(MachInst))
+            outOfBytes = true;
+        return ResetState;
+    } else {
+        // We matched so far, but need to check more chunks.
+        chunkIdx++;
+        outOfBytes = true;
+        return FromCacheState;
+    }
+}
+
 //Either get a prefix and record it in the ExtMachInst, or send the
 //state machine on to get the opcode(s).
-Decoder::State Decoder::doPrefixState(uint8_t nextByte)
+Decoder::State
+Decoder::doPrefixState(uint8_t nextByte)
 {
     uint8_t prefix = Prefixes[nextByte];
     State nextState = PrefixState;
@@ -164,7 +210,8 @@
 
 //Load all the opcodes (currently up to 2) and then figure out
 //what immediate and/or ModRM is needed.
-Decoder::State Decoder::doOpcodeState(uint8_t nextByte)
+Decoder::State
+Decoder::doOpcodeState(uint8_t nextByte)
 {
     State nextState = ErrorState;
     emi.opcode.num++;
@@ -194,9 +241,9 @@
         if (emi.rex.w)
             logOpSize = 3; // 64 bit operand size
         else if (emi.legacy.op)
-            logOpSize = m5Reg.altOp;
+            logOpSize = altOp;
         else
-            logOpSize = m5Reg.defOp;
+            logOpSize = defOp;
 
         //Set the actual op size
         emi.opSize = 1 << logOpSize;
@@ -205,16 +252,16 @@
         //a fixed value at the decoder level.
         int logAddrSize;
         if(emi.legacy.addr)
-            logAddrSize = m5Reg.altAddr;
+            logAddrSize = altAddr;
         else
-            logAddrSize = m5Reg.defAddr;
+            logAddrSize = defAddr;
 
         //Set the actual address size
         emi.addrSize = 1 << logAddrSize;
 
         //Figure out the effective stack width. This can be overriden to
         //a fixed value at the decoder level.
-        emi.stackSize = 1 << m5Reg.stack;
+        emi.stackSize = 1 << stack;
 
         //Figure out how big of an immediate we'll retreive based
         //on the opcode.
@@ -242,13 +289,14 @@
 //Get the ModRM byte and determine what displacement, if any, there is.
 //Also determine whether or not to get the SIB byte, displacement, or
 //immediate next.
-Decoder::State Decoder::doModRMState(uint8_t nextByte)
+Decoder::State
+Decoder::doModRMState(uint8_t nextByte)
 {
     State nextState = ErrorState;
     ModRM modRM;
     modRM = nextByte;
     DPRINTF(Decoder, "Found modrm byte %#x.\n", nextByte);
-    if (m5Reg.defOp == 1) {
+    if (defOp == 1) {
         //figure out 16 bit displacement size
         if ((modRM.mod == 0 && modRM.rm == 6) || modRM.mod == 2)
             displacementSize = 2;
@@ -297,7 +345,8 @@
 //Get the SIB byte. We don't do anything with it at this point, other
 //than storing it in the ExtMachInst. Determine if we need to get a
 //displacement or immediate next.
-Decoder::State Decoder::doSIBState(uint8_t nextByte)
+Decoder::State
+Decoder::doSIBState(uint8_t nextByte)
 {
     State nextState = ErrorState;
     emi.sib = nextByte;
@@ -318,7 +367,8 @@
 
 //Gather up the displacement, or at least as much of it
 //as we can get.
-Decoder::State Decoder::doDisplacementState()
+Decoder::State
+Decoder::doDisplacementState()
 {
     State nextState = ErrorState;
 
@@ -365,7 +415,8 @@
 
 //Gather up the immediate, or at least as much of it
 //as we can get
-Decoder::State Decoder::doImmediateState()
+Decoder::State
+Decoder::doImmediateState()
 {
     State nextState = ErrorState;
 
@@ -408,24 +459,62 @@
     return nextState;
 }
 
-DecodeCache::InstMap Decoder::instMap;
-DecodeCache::AddrMap<StaticInstPtr> Decoder::decodePages;
+Decoder::InstBytes Decoder::dummy;
+Decoder::InstCacheMap Decoder::instCacheMap;
 
 StaticInstPtr
 Decoder::decode(ExtMachInst mach_inst, Addr addr)
 {
-    StaticInstPtr &si = decodePages.lookup(addr);
-    if (si && (si->machInst == mach_inst))
+    DecodeCache::InstMap::iterator iter = instMap->find(mach_inst);
+    if (iter != instMap->end())
+        return iter->second;
+
+    StaticInstPtr si = decodeInst(mach_inst);
+    (*instMap)[mach_inst] = si;
+    return si;
+}
+
+StaticInstPtr
+Decoder::decode(PCState &nextPC)
+{
+    if (!instDone)
+        return NULL;
+    instDone = false;
+    updateNPC(nextPC);
+
+    StaticInstPtr &si = instBytes->si;
+    if (si)
         return si;
 
-    DecodeCache::InstMap::iterator iter = instMap.find(mach_inst);
-    if (iter != instMap.end()) {
-        si = iter->second;
-        return si;
+    // We didn't match in the AddrMap, but we still populated an entry. Fix
+    // up its byte masks.
+    const int chunkSize = sizeof(MachInst);
+
+    instBytes->lastOffset = offset;
+
+    Addr firstBasePC = basePC - (instBytes->chunks.size() - 1) * chunkSize;
+    Addr firstOffset = origPC - firstBasePC;
+    Addr totalSize = instBytes->lastOffset - firstOffset +
+        (instBytes->chunks.size() - 1) * chunkSize;
+    int start = firstOffset;
+    instBytes->masks.clear();
+
+    while (totalSize) {
+        int end = start + totalSize;
+        end = (chunkSize < end) ? chunkSize : end;
+        int size = end - start;
+        int idx = instBytes->masks.size();
+
+        MachInst maskVal = mask(size * 8) << (start * 8);
+        assert(maskVal);
+
+        instBytes->masks.push_back(maskVal);
+        instBytes->chunks[idx] &= instBytes->masks[idx];
+        totalSize -= size;
+        start = 0;
     }
 
-    si = decodeInst(mach_inst);
-    instMap[mach_inst] = si;
+    si = decode(emi, origPC);
     return si;
 }
 
diff -r ecfd5607d5e9 -r 270c9a75e91f src/arch/x86/decoder.hh
--- a/src/arch/x86/decoder.hh   Fri Jan 04 18:09:45 2013 -0600
+++ b/src/arch/x86/decoder.hh   Fri Jan 04 19:00:44 2013 -0600
@@ -32,6 +32,7 @@
 #define __ARCH_X86_DECODER_HH__
 
 #include <cassert>
+#include <vector>
 
 #include "arch/x86/regs/misc.hh"
 #include "arch/x86/types.hh"
@@ -58,9 +59,24 @@
     static const uint8_t SizeTypeToSize[3][10];
 
   protected:
+    struct InstBytes
+    {
+        StaticInstPtr si;
+        std::vector<MachInst> chunks;
+        std::vector<MachInst> masks;
+        int lastOffset;
+
+        InstBytes() : lastOffset(0)
+        {}
+    };
+
_______________________________________________
gem5-dev mailing list
[email protected]
http://m5sim.org/mailman/listinfo/gem5-dev

Reply via email to