changeset b243dc8cec8b in /z/repo/m5 details: http://repo.m5sim.org/m5?cmd=changeset;node=b243dc8cec8b description: X86: Don't read in dest regs if all bits are replaced.
In x86, 32- and 64-bit writes to registers that appear to be 32 or 64 bits wide overwrite all bits of the destination register. This change removes the false dependencies in those cases, where the previous value of a register doesn't need to be read to write a new value. New versions of most microops are created that have a "Big" suffix which simply overwrite their destination, and the right version to use is selected during microop allocation based on the selected data size. This does not change the performance of the O3 CPU model significantly, I assume because there are other false dependencies from the condition code bits in the flags register. diffstat: src/arch/x86/isa/microops/ldstop.isa | 78 ++++++++- src/arch/x86/isa/microops/limmop.isa | 27 ++- src/arch/x86/isa/microops/regop.isa | 269 +++++++++++++++++++++++----------- 3 files changed, 266 insertions(+), 108 deletions(-) diffs (truncated from 765 to 300 lines): diff -r 0dff1ff293d0 -r b243dc8cec8b src/arch/x86/isa/microops/ldstop.isa --- a/src/arch/x86/isa/microops/ldstop.isa Sun Feb 13 17:42:56 2011 -0800 +++ b/src/arch/x86/isa/microops/ldstop.isa Sun Feb 13 17:44:24 2011 -0800 @@ -301,6 +301,46 @@ "dataSize" : self.dataSize, "addressSize" : self.addressSize, "memFlags" : self.memFlags} return allocator + + class BigLdStOp(X86Microop): + def __init__(self, data, segment, addr, disp, + dataSize, addressSize, baseFlags, atCPL0, prefetch): + self.data = data + [self.scale, self.index, self.base] = addr + self.disp = disp + self.segment = segment + self.dataSize = dataSize + self.addressSize = addressSize + self.memFlags = baseFlags + if atCPL0: + self.memFlags += " | (CPL0FlagBit << FlagShift)" + if prefetch: + self.memFlags += " | Request::PREFETCH" + self.memFlags += " | (machInst.legacy.addr ? " + \ + "(AddrSizeFlagBit << FlagShift) : 0)" + + def getAllocator(self, microFlags): + allocString = ''' + (%(dataSize)s >= 4) ? 
+ (StaticInstPtr)(new %(class_name)sBig(machInst, + macrocodeBlock, %(flags)s, %(scale)s, %(index)s, + %(base)s, %(disp)s, %(segment)s, %(data)s, + %(dataSize)s, %(addressSize)s, %(memFlags)s)) : + (StaticInstPtr)(new %(class_name)s(machInst, + macrocodeBlock, %(flags)s, %(scale)s, %(index)s, + %(base)s, %(disp)s, %(segment)s, %(data)s, + %(dataSize)s, %(addressSize)s, %(memFlags)s)) + ''' + allocator = allocString % { + "class_name" : self.className, + "flags" : self.microFlagsText(microFlags), + "scale" : self.scale, "index" : self.index, + "base" : self.base, + "disp" : self.disp, + "segment" : self.segment, "data" : self.data, + "dataSize" : self.dataSize, "addressSize" : self.addressSize, + "memFlags" : self.memFlags} + return allocator }}; let {{ @@ -315,7 +355,8 @@ EA = bits(SegBase + scale * Index + Base + disp, addressSize * 8 - 1, 0); ''' - def defineMicroLoadOp(mnemonic, code, mem_flags="0"): + def defineMicroLoadOp(mnemonic, code, bigCode='', + mem_flags="0", big=True): global header_output global decoder_output global exec_output @@ -324,16 +365,22 @@ name = mnemonic.lower() # Build up the all register version of this micro op - iop = InstObjParams(name, Name, 'X86ISA::LdStOp', - {"code": code, - "ea_code": calculateEA}) - header_output += MicroLdStOpDeclare.subst(iop) - decoder_output += MicroLdStOpConstructor.subst(iop) - exec_output += MicroLoadExecute.subst(iop) - exec_output += MicroLoadInitiateAcc.subst(iop) - exec_output += MicroLoadCompleteAcc.subst(iop) + iops = [InstObjParams(name, Name, 'X86ISA::LdStOp', + {"code": code, "ea_code": calculateEA})] + if big: + iops += [InstObjParams(name, Name + "Big", 'X86ISA::LdStOp', + {"code": bigCode, "ea_code": calculateEA})] + for iop in iops: + header_output += MicroLdStOpDeclare.subst(iop) + decoder_output += MicroLdStOpConstructor.subst(iop) + exec_output += MicroLoadExecute.subst(iop) + exec_output += MicroLoadInitiateAcc.subst(iop) + exec_output += MicroLoadCompleteAcc.subst(iop) - class 
LoadOp(LdStOp): + base = LdStOp + if big: + base = BigLdStOp + class LoadOp(base): def __init__(self, data, segment, addr, disp = 0, dataSize="env.dataSize", addressSize="env.addressSize", @@ -346,12 +393,15 @@ microopClasses[name] = LoadOp - defineMicroLoadOp('Ld', 'Data = merge(Data, Mem, dataSize);') + defineMicroLoadOp('Ld', 'Data = merge(Data, Mem, dataSize);', + 'Data = Mem & mask(dataSize * 8);') defineMicroLoadOp('Ldst', 'Data = merge(Data, Mem, dataSize);', - '(StoreCheck << FlagShift)') + 'Data = Mem & mask(dataSize * 8);', + '(StoreCheck << FlagShift)') defineMicroLoadOp('Ldstl', 'Data = merge(Data, Mem, dataSize);', - '(StoreCheck << FlagShift) | Request::LOCKED') - defineMicroLoadOp('Ldfp', 'FpData.uqw = Mem;') + 'Data = Mem & mask(dataSize * 8);', + '(StoreCheck << FlagShift) | Request::LOCKED') + defineMicroLoadOp('Ldfp', 'FpData.uqw = Mem;', big = False) def defineMicroStoreOp(mnemonic, code, \ postCode="", completeCode="", mem_flags="0"): diff -r 0dff1ff293d0 -r b243dc8cec8b src/arch/x86/isa/microops/limmop.isa --- a/src/arch/x86/isa/microops/limmop.isa Sun Feb 13 17:42:56 2011 -0800 +++ b/src/arch/x86/isa/microops/limmop.isa Sun Feb 13 17:44:24 2011 -0800 @@ -114,8 +114,16 @@ self.dataSize = dataSize def getAllocator(self, microFlags): - allocator = '''new %(class_name)s(machInst, macrocodeBlock, - %(flags)s, %(dest)s, %(imm)s, %(dataSize)s)''' % { + allocString = ''' + (%(dataSize)s >= 4) ? 
+ (StaticInstPtr)(new %(class_name)sBig(machInst, + macrocodeBlock, %(flags)s, %(dest)s, %(imm)s, + %(dataSize)s)) : + (StaticInstPtr)(new %(class_name)s(machInst, + macrocodeBlock, %(flags)s, %(dest)s, %(imm)s, + %(dataSize)s)) + ''' + allocator = allocString % { "class_name" : self.className, "mnemonic" : self.mnemonic, "flags" : self.microFlagsText(microFlags), @@ -152,12 +160,15 @@ let {{ # Build up the all register version of this micro op - iop = InstObjParams("limm", "Limm", 'X86MicroopBase', - {"code" : "DestReg = merge(DestReg, imm, dataSize);"}) - header_output += MicroLimmOpDeclare.subst(iop) - decoder_output += MicroLimmOpConstructor.subst(iop) - decoder_output += MicroLimmOpDisassembly.subst(iop) - exec_output += MicroLimmOpExecute.subst(iop) + iops = [InstObjParams("limm", "Limm", 'X86MicroopBase', + {"code" : "DestReg = merge(DestReg, imm, dataSize);"}), + InstObjParams("limm", "LimmBig", 'X86MicroopBase', + {"code" : "DestReg = imm & mask(dataSize * 8);"})] + for iop in iops: + header_output += MicroLimmOpDeclare.subst(iop) + decoder_output += MicroLimmOpConstructor.subst(iop) + decoder_output += MicroLimmOpDisassembly.subst(iop) + exec_output += MicroLimmOpExecute.subst(iop) iop = InstObjParams("lfpimm", "Lfpimm", 'X86MicroopBase', {"code" : "FpDestReg.uqw = imm"}) diff -r 0dff1ff293d0 -r b243dc8cec8b src/arch/x86/isa/microops/regop.isa --- a/src/arch/x86/isa/microops/regop.isa Sun Feb 13 17:42:56 2011 -0800 +++ b/src/arch/x86/isa/microops/regop.isa Sun Feb 13 17:44:24 2011 -0800 @@ -224,8 +224,8 @@ MicroRegOpExecute) class RegOpMeta(type): - def buildCppClasses(self, name, Name, suffix, \ - code, flag_code, cond_check, else_code, cond_control_flag_init): + def buildCppClasses(self, name, Name, suffix, code, big_code, \ + flag_code, cond_check, else_code, cond_control_flag_init): # Globals to stick the output in global header_output @@ -235,11 +235,13 @@ # Stick all the code together so it can be searched at once allCode = "|".join((code, 
flag_code, cond_check, else_code, cond_control_flag_init)) + allBigCode = "|".join((big_code, flag_code, cond_check, else_code, + cond_control_flag_init)) # If op2 is used anywhere, make register and immediate versions # of this code. matcher = re.compile("(?<!\\w)(?P<prefix>s?)op2(?P<typeQual>\\.\\w+)?") - match = matcher.search(allCode) + match = matcher.search(allCode + allBigCode) if match: typeQual = "" if match.group("typeQual"): @@ -247,6 +249,7 @@ src2_name = "%spsrc2%s" % (match.group("prefix"), typeQual) self.buildCppClasses(name, Name, suffix, matcher.sub(src2_name, code), + matcher.sub(src2_name, big_code), matcher.sub(src2_name, flag_code), matcher.sub(src2_name, cond_check), matcher.sub(src2_name, else_code), @@ -254,6 +257,7 @@ imm_name = "%simm8" % match.group("prefix") self.buildCppClasses(name + "i", Name, suffix + "Imm", matcher.sub(imm_name, code), + matcher.sub(imm_name, big_code), matcher.sub(imm_name, flag_code), matcher.sub(imm_name, cond_check), matcher.sub(imm_name, else_code), @@ -264,27 +268,32 @@ # a version without it and fix up this version to use it. if flag_code != "" or cond_check != "true": self.buildCppClasses(name, Name, suffix, - code, "", "true", else_code, "") + code, big_code, "", "true", else_code, "") suffix = "Flags" + suffix # If psrc1 or psrc2 is used, we need to actually insert code to # compute it. 
- matcher = re.compile("(?<!\w)psrc1(?!\w)") - if matcher.search(allCode): - code = "uint64_t psrc1 = pick(SrcReg1, 0, dataSize);" + code - matcher = re.compile("(?<!\w)psrc2(?!\w)") - if matcher.search(allCode): - code = "uint64_t psrc2 = pick(SrcReg2, 1, dataSize);" + code - # Also make available versions which do sign extension - matcher = re.compile("(?<!\w)spsrc1(?!\w)") - if matcher.search(allCode): - code = "int64_t spsrc1 = signedPick(SrcReg1, 0, dataSize);" + code - matcher = re.compile("(?<!\w)spsrc2(?!\w)") - if matcher.search(allCode): - code = "int64_t spsrc2 = signedPick(SrcReg2, 1, dataSize);" + code - matcher = re.compile("(?<!\w)simm8(?!\w)") - if matcher.search(allCode): - code = "int8_t simm8 = imm8;" + code + for (big, all) in ((False, allCode), (True, allBigCode)): + prefix = "" + for (rex, decl) in ( + ("(?<!\w)psrc1(?!\w)", + "uint64_t psrc1 = pick(SrcReg1, 0, dataSize);"), + ("(?<!\w)psrc2(?!\w)", + "uint64_t psrc2 = pick(SrcReg2, 1, dataSize);"), + ("(?<!\w)spsrc1(?!\w)", + "int64_t spsrc1 = signedPick(SrcReg1, 0, dataSize);"), + ("(?<!\w)spsrc2(?!\w)", + "int64_t spsrc2 = signedPick(SrcReg2, 1, dataSize);"), + ("(?<!\w)simm8(?!\w)", + "int8_t simm8 = imm8;")): + matcher = re.compile(rex) + if matcher.search(all): + prefix += decl + "\n" + if big: + if big_code != "": + big_code = prefix + big_code + else: + code = prefix + code base = "X86ISA::RegOp" @@ -297,17 +306,26 @@ templates = immTemplates # Get everything ready for the substitution - iop = InstObjParams(name, Name + suffix, base, + iops = [InstObjParams(name, Name + suffix, base, {"code" : code, "flag_code" : flag_code, "cond_check" : cond_check, "else_code" : else_code, - "cond_control_flag_init": cond_control_flag_init}) + "cond_control_flag_init" : cond_control_flag_init})] + if big_code != "": + iops += [InstObjParams(name, Name + suffix + "Big", base, + {"code" : big_code, + "flag_code" : flag_code, + "cond_check" : cond_check, + "else_code" : else_code, + 
"cond_control_flag_init" : + cond_control_flag_init})] # Generate the actual code (finally!) - header_output += templates[0].subst(iop) - decoder_output += templates[1].subst(iop) - exec_output += templates[2].subst(iop) + for iop in iops: + header_output += templates[0].subst(iop) + decoder_output += templates[1].subst(iop) + exec_output += templates[2].subst(iop) def __new__(mcls, Name, bases, dict): @@ -322,14 +340,16 @@ cls.className = Name cls.base_mnemonic = name code = cls.code + big_code = cls.big_code flag_code = cls.flag_code cond_check = cls.cond_check else_code = cls.else_code cond_control_flag_init = cls.cond_control_flag_init # Set up the C++ classes - mcls.buildCppClasses(cls, name, Name, "", code, flag_code, - cond_check, else_code, cond_control_flag_init) + mcls.buildCppClasses(cls, name, Name, "", code, big_code, + flag_code, cond_check, else_code, + cond_control_flag_init) _______________________________________________ m5-dev mailing list m5-dev@m5sim.org http://m5sim.org/mailman/listinfo/m5-dev