Author: Armin Rigo <[email protected]>
Branch: reusing-r11
Changeset: r90965:d8f5a5347abb
Date: 2017-04-05 15:29 +0200
http://bitbucket.org/pypy/pypy/changeset/d8f5a5347abb/
Log: in-progress
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -277,7 +277,7 @@
#
mc.TEST_rr(eax.value, eax.value)
mc.J_il(rx86.Conditions['Z'], 0xfffff) # patched later
- jz_location = mc.get_relative_pos()
+ jz_location = mc.get_relative_pos(break_basic_block=False)
mc.MOV_rr(ecx.value, eax.value)
#
nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
@@ -718,6 +718,7 @@
if rx86.fits_in_32bits(offset):
mc.JMP_l(offset)
else:
+ # mc.forget_scratch_register() not needed here
mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target)
mc.JMP_r(X86_64_SCRATCH_REG.value)
mc.copy_to_raw_memory(adr_jump_offset)
@@ -830,10 +831,10 @@
descrs = self.cpu.gc_ll_descr.getframedescrs(self.cpu)
ofs = self.cpu.unpack_fielddescr(descrs.arraydescr.lendescr)
mc.CMP_bi(ofs, 0xffffff) # force writing 32 bit
- stack_check_cmp_ofs = mc.get_relative_pos() - 4
+ stack_check_cmp_ofs = mc.get_relative_pos(break_basic_block=False) - 4
jg_location = mc.emit_forward_jump('GE')
mc.MOV_si(WORD, 0xffffff) # force writing 32 bit
- ofs2 = mc.get_relative_pos() - 4
+ ofs2 = mc.get_relative_pos(break_basic_block=False) - 4
self.push_gcmap(mc, gcmap, store=True)
mc.CALL(imm(self._frame_realloc_slowpath))
# patch the JG above
@@ -850,11 +851,11 @@
descrs = self.cpu.gc_ll_descr.getframedescrs(self.cpu)
ofs = self.cpu.unpack_fielddescr(descrs.arraydescr.lendescr)
mc.CMP_bi(ofs, 0xffffff)
- stack_check_cmp_ofs = mc.get_relative_pos() - 4
+ stack_check_cmp_ofs = mc.get_relative_pos(break_basic_block=False) - 4
jg_location = mc.emit_forward_jump('GE')
mc.MOV_rr(edi.value, ebp.value)
mc.MOV_ri(esi.value, 0xffffff)
- ofs2 = mc.get_relative_pos() - 4
+ ofs2 = mc.get_relative_pos(break_basic_block=False) - 4
mc.CALL(imm(self.cpu.realloc_frame_crash))
# patch the JG above
mc.patch_forward_jump(jg_location)
@@ -895,6 +896,7 @@
# "mov r11, addr; jmp r11" is up to 13 bytes, which fits in there
# because we always write "mov r11, imm-as-8-bytes; call *r11" in
# the first place.
+ # mc.forget_scratch_register() not needed here
mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target)
mc.JMP_r(X86_64_SCRATCH_REG.value)
p = rffi.cast(rffi.INTP, adr_jump_offset)
@@ -939,7 +941,7 @@
# would be used to pass arguments #3 and #4 (even though, so
# far, the assembler only receives two arguments).
tloc = esi
- old = r11
+ old = r10
# eax = address in the stack of a 3-words struct vmprof_stack_s
self.mc.LEA_rs(eax.value, (FRAME_FIXED_SIZE - 4) * WORD)
# old = current value of vmprof_tl_stack
@@ -1023,27 +1025,14 @@
fit in 32 bits, it will be loaded in r11.
"""
rst = gcrootmap.get_root_stack_top_addr()
- if rx86.fits_in_32bits(rst):
- mc.MOV_rj(ebx.value, rst) # MOV ebx, [rootstacktop]
- else:
- mc.MOV_ri(X86_64_SCRATCH_REG.value, rst) # MOV r11, rootstacktop
- mc.MOV_rm(ebx.value, (X86_64_SCRATCH_REG.value, 0))
- # MOV ebx, [r11]
- #
+ mc.MOV(ebx, heap(rst)) # maybe via loading r11
return rst
def _call_header_shadowstack(self, gcrootmap):
rst = self._load_shadowstack_top_in_ebx(self.mc, gcrootmap)
self.mc.MOV_mr((ebx.value, 0), ebp.value) # MOV [ebx], ebp
self.mc.ADD_ri(ebx.value, WORD)
- if rx86.fits_in_32bits(rst):
- self.mc.MOV_jr(rst, ebx.value) # MOV [rootstacktop], ebx
- else:
- # The integer 'rst' doesn't fit in 32 bits, so we know that
- # _load_shadowstack_top_in_ebx() above loaded it in r11.
- # Reuse it. Be careful not to overwrite r11 in the middle!
- self.mc.MOV_mr((X86_64_SCRATCH_REG.value, 0),
- ebx.value) # MOV [r11], ebx
+ self.mc.MOV(heap(rst), ebx) # MOV [rootstacktop], ebx
def _call_footer_shadowstack(self, gcrootmap):
rst = gcrootmap.get_root_stack_top_addr()
@@ -1449,7 +1438,7 @@
# has been emitted. 64-bit mode only.
assert IS_X86_64
address_in_buffer = index * WORD # at the start of the buffer
- p_location = self.mc.get_relative_pos()
+ p_location = self.mc.get_relative_pos(break_basic_block=False)
offset = address_in_buffer - p_location
self.mc.overwrite32(p_location-4, offset)
@@ -1551,7 +1540,7 @@
self.mc.add_pending_relocation()
elif WORD == 8:
self.mc.J_il(rx86.Conditions['Z'], 0)
- pos = self.mc.get_relative_pos()
+ pos = self.mc.get_relative_pos(break_basic_block=False)
self.pending_memoryerror_trampoline_from.append(pos)
# ----------
@@ -1721,7 +1710,8 @@
def genop_guard_guard_not_invalidated(self, guard_op, guard_token,
locs, ign):
- pos = self.mc.get_relative_pos() + 1 # after potential jmp
+ pos = self.mc.get_relative_pos(break_basic_block=False)
+ pos += 1 # after potential jmp
guard_token.pos_jump_offset = pos
self.pending_guard_tokens.append(guard_token)
@@ -2077,7 +2067,8 @@
assert self.guard_success_cc >= 0
self.mc.J_il(rx86.invert_condition(self.guard_success_cc), 0)
self.guard_success_cc = rx86.cond_none
- guard_token.pos_jump_offset = self.mc.get_relative_pos() - 4
+ pos = self.mc.get_relative_pos(break_basic_block=False)
+ guard_token.pos_jump_offset = pos - 4
self.pending_guard_tokens.append(guard_token)
def _genop_real_call(self, op, arglocs, resloc):
@@ -2125,6 +2116,7 @@
faildescrindex = self.get_gcref_from_faildescr(faildescr)
if IS_X86_64:
+ self.mc.forget_scratch_register()
self.mc.MOV_rp(X86_64_SCRATCH_REG.value, 0)
self._patch_load_from_gc_table(faildescrindex)
self.mc.MOV(raw_stack(ofs), X86_64_SCRATCH_REG)
@@ -2313,6 +2305,7 @@
if IS_X86_64 and isinstance(loc_base, RegLoc):
# copy loc_index into r11
tmp1 = X86_64_SCRATCH_REG
+ mc.forget_scratch_register()
mc.MOV_rr(tmp1.value, loc_index.value)
final_pop = False
else:
@@ -2325,7 +2318,13 @@
# XOR tmp, -8
mc.XOR_ri(tmp1.value, -8)
# BTS [loc_base], tmp
- mc.BTS(addr_add_const(loc_base, 0), tmp1)
+ if final_pop:
+ # r11 is not specially used, fall back to regloc.py
+ mc.BTS(addr_add_const(loc_base, 0), tmp1)
+ else:
+ # tmp1 is r11! but in this case, loc_base is a
+ # register so we can invoke directly rx86.py
+ mc.BTS_mr((loc_base.value, 0), tmp1.value)
# done
if final_pop:
mc.POP_r(loc_index.value)
diff --git a/rpython/jit/backend/x86/callbuilder.py b/rpython/jit/backend/x86/callbuilder.py
--- a/rpython/jit/backend/x86/callbuilder.py
+++ b/rpython/jit/backend/x86/callbuilder.py
@@ -239,7 +239,7 @@
if IS_X86_32:
tmpreg = edx
else:
- tmpreg = r11 # edx is used for 3rd argument
+ tmpreg = r10 # edx is used for 3rd argument
mc.MOV_rm(tmpreg.value, (tlofsreg.value, p_errno))
mc.MOV32_rm(eax.value, (tlofsreg.value, rpy_errno))
mc.MOV32_mr((tmpreg.value, 0), eax.value)
diff --git a/rpython/jit/backend/x86/codebuf.py b/rpython/jit/backend/x86/codebuf.py
--- a/rpython/jit/backend/x86/codebuf.py
+++ b/rpython/jit/backend/x86/codebuf.py
@@ -42,10 +42,10 @@
self.ops_offset = {}
def add_pending_relocation(self):
- self.relocations.append(self.get_relative_pos())
+ self.relocations.append(self.get_relative_pos(break_basic_block=False))
def mark_op(self, op):
- pos = self.get_relative_pos()
+ pos = self.get_relative_pos(break_basic_block=False)
self.ops_offset[op] = pos
def copy_to_raw_memory(self, addr):
@@ -64,11 +64,11 @@
def emit_forward_jump_cond(self, cond):
self.J_il8(cond, 0)
- return self.get_relative_pos()
+ return self.get_relative_pos(break_basic_block=False)
def emit_forward_jump_uncond(self):
self.JMP_l8(0)
- return self.get_relative_pos()
+ return self.get_relative_pos(break_basic_block=False)
def patch_forward_jump(self, jcond_location):
offset = self.get_relative_pos() - jcond_location
@@ -76,3 +76,8 @@
if offset > 127:
raise ShortJumpTooFar
self.overwrite(jcond_location-1, chr(offset))
+
+ def get_relative_pos(self, break_basic_block=True):
+ if break_basic_block:
+ self.forget_scratch_register()
+ return BlockBuilderMixin.get_relative_pos(self)
diff --git a/rpython/jit/backend/x86/jump.py b/rpython/jit/backend/x86/jump.py
--- a/rpython/jit/backend/x86/jump.py
+++ b/rpython/jit/backend/x86/jump.py
@@ -77,6 +77,7 @@
assembler.regalloc_pop(dst)
return
assembler.regalloc_mov(src, tmpreg)
+ assembler.mc.forget_scratch_register()
src = tmpreg
assembler.regalloc_mov(src, dst)
diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -435,9 +435,9 @@
def consider_guard_not_invalidated(self, op):
mc = self.assembler.mc
- n = mc.get_relative_pos()
+ n = mc.get_relative_pos(break_basic_block=False)
self.perform_guard(op, [], None)
- assert n == mc.get_relative_pos()
+ assert n == mc.get_relative_pos(break_basic_block=False)
# ensure that the next label is at least 5 bytes farther than
# the current position. Otherwise, when invalidating the guard,
# we would overwrite randomly the next label's position.
diff --git a/rpython/jit/backend/x86/regloc.py b/rpython/jit/backend/x86/regloc.py
--- a/rpython/jit/backend/x86/regloc.py
+++ b/rpython/jit/backend/x86/regloc.py
@@ -4,7 +4,7 @@
from rpython.jit.backend.x86.arch import WORD, IS_X86_32, IS_X86_64
from rpython.tool.sourcetools import func_with_new_name
from rpython.rlib.objectmodel import specialize, instantiate
-from rpython.rlib.rarithmetic import intmask
+from rpython.rlib.rarithmetic import intmask, r_uint
from rpython.jit.metainterp.history import FLOAT, INT
from rpython.jit.codewriter import longlong
from rpython.rtyper.lltypesystem import rffi, lltype
@@ -355,7 +355,8 @@
# without an xmm scratch reg.
X86_64_XMM_SCRATCH_REG = xmm15
-unrolling_location_codes = unrolling_iterable(list("rbsmajix"))
+# note: 'r' is after 'i' in this list, for _binaryop()
+unrolling_location_codes = unrolling_iterable(list("irbsmajx"))
@specialize.arg(1)
def _rx86_getattr(obj, methname):
@@ -372,9 +373,7 @@
class LocationCodeBuilder(object):
_mixin_ = True
- _reuse_scratch_register = False # for now, this is always False
- _scratch_register_known = False # for now, this is always False
- _scratch_register_value = 0
+ _scratch_register_value = 0 # 0 means 'unknown'
def _binaryop(name):
@@ -383,7 +382,7 @@
val2 = loc2.value_i()
if name == 'MOV' and isinstance(loc1, RegLoc):
self.MOV_ri(loc1.value, val2)
- return
+ return True
code1 = loc1.location_code()
if code1 == 'j':
checkvalue = loc1.value_j()
@@ -402,10 +401,11 @@
self.MOV_ri(freereg.value, val2)
INSN(self, loc1, freereg)
self.POP_r(freereg.value)
+ return True
else:
# For this case, we should not need the scratch register more than here.
self._load_scratch(val2)
- INSN(self, loc1, X86_64_SCRATCH_REG)
+ return False
def invoke(self, codes, val1, val2):
methname = name + "_" + codes
@@ -433,15 +433,15 @@
code1 = loc1.location_code()
code2 = loc2.location_code()
- # You can pass in the scratch register as a location, but you
- # must be careful not to combine it with location types that
- # might need to use the scratch register themselves.
- if loc2 is X86_64_SCRATCH_REG:
- if code1 == 'j':
- assert (name.startswith("MOV") and
- rx86.fits_in_32bits(loc1.value_j()))
- if loc1 is X86_64_SCRATCH_REG and not name.startswith("MOV"):
- assert code2 not in ('j', 'i')
+ # You cannot pass in the scratch register as a location,
+ # except with a MOV instruction.
+ if name.startswith('MOV'):
+ if loc2 is X86_64_SCRATCH_REG:
+ assert code1 != 'j' and code1 != 'm' and code1 != 'a'
+ if loc1 is X86_64_SCRATCH_REG:
+ self.forget_scratch_register()
+ elif loc1 is X86_64_SCRATCH_REG or loc2 is X86_64_SCRATCH_REG:
+ raise AssertionError("%s with scratch reg specified" % name)
for possible_code2 in unrolling_location_codes:
if not has_implementation_for('?', possible_code2):
@@ -451,8 +451,14 @@
#
# Fake out certain operations for x86_64
if self.WORD == 8 and possible_code2 == 'i' and not rx86.fits_in_32bits(val2):
- insn_with_64_bit_immediate(self, loc1, loc2)
- return
+ if insn_with_64_bit_immediate(self, loc1, loc2):
+ return # done
+ loc2 = X86_64_SCRATCH_REG
+ code2 = 'r'
+ # NB. unrolling_location_codes contains 'r'
+ # after 'i', so that it will be found after
+ # this iteration
+ continue
#
# Regular case
for possible_code1 in unrolling_location_codes:
@@ -487,6 +493,9 @@
def _unaryop(name):
def INSN(self, loc):
+ if loc is X86_64_SCRATCH_REG:
+ raise AssertionError("%s with scratch reg specified" % name)
+
code = loc.location_code()
for possible_code in unrolling_location_codes:
if code == possible_code:
@@ -532,6 +541,9 @@
else:
methname = name + "_" + possible_code
_rx86_getattr(self, methname)(val)
+ # This is for CALL and JMP, so it's correct to forget
+ # the value of the R11 register here.
+ self.forget_scratch_register()
return func_with_new_name(INSN, "INSN_" + name)
@@ -540,16 +552,18 @@
# If we are within a "reuse_scratch_register" block, we remember the
# last value we loaded to the scratch register and encode the address
# as an offset from that if we can
- if self._scratch_register_known:
- offset = addr - self._scratch_register_value
+ if self._scratch_register_value != 0:
+ offset = r_uint(addr) - r_uint(self._scratch_register_value)
+ offset = intmask(offset)
if rx86.fits_in_32bits(offset):
+ print '_addr_as_reg_offset(%x) [REUSED r11+%d]' % (
+ addr, offset)
return (X86_64_SCRATCH_REG.value, offset)
+ print '_addr_as_reg_offset(%x) [too far]' % (addr,)
# else: fall through
-
- if self._reuse_scratch_register:
- self._scratch_register_known = True
- self._scratch_register_value = addr
-
+ else:
+ print '_addr_as_reg_offset(%x) [new]' % (addr,)
+ self._scratch_register_value = addr
self.MOV_ri(X86_64_SCRATCH_REG.value, addr)
return (X86_64_SCRATCH_REG.value, 0)
@@ -557,12 +571,11 @@
# For cases where an AddressLoc has the location_code 'm', but
# where the static offset does not fit in 32-bits. We have to fall
# back to the X86_64_SCRATCH_REG. Returns a new location encoded
- # as mode 'm' too. These are all possibly rare cases; don't try
- # to reuse a past value of the scratch register at all.
- self._scratch_register_known = False
- self.MOV_ri(X86_64_SCRATCH_REG.value, static_offset)
+ # as mode 'm' too. These are all possibly rare cases.
+ ofs = self._addr_as_reg_offset(static_offset)
+ self.forget_scratch_register()
self.LEA_ra(X86_64_SCRATCH_REG.value,
- (basereg, X86_64_SCRATCH_REG.value, 0, 0))
+ (basereg, X86_64_SCRATCH_REG.value, 0, ofs))
return (X86_64_SCRATCH_REG.value, 0)
def _fix_static_offset_64_a(self, (basereg, scalereg,
@@ -570,41 +583,48 @@
# For cases where an AddressLoc has the location_code 'a', but
# where the static offset does not fit in 32-bits. We have to fall
# back to the X86_64_SCRATCH_REG. In one case it is even more
- # annoying. These are all possibly rare cases; don't try to reuse a
- # past value of the scratch register at all.
- self._scratch_register_known = False
- self.MOV_ri(X86_64_SCRATCH_REG.value, static_offset)
+ # annoying. These are all possibly rare cases.
+ ofs = self._addr_as_reg_offset(static_offset)
#
if basereg != rx86.NO_BASE_REGISTER:
+ self.forget_scratch_register()
self.LEA_ra(X86_64_SCRATCH_REG.value,
- (basereg, X86_64_SCRATCH_REG.value, 0, 0))
- return (X86_64_SCRATCH_REG.value, scalereg, scale, 0)
+ (basereg, X86_64_SCRATCH_REG.value, 0, ofs))
+ ofs = 0
+ return (X86_64_SCRATCH_REG.value, scalereg, scale, ofs)
def _load_scratch(self, value):
- if (self._scratch_register_known
- and value == self._scratch_register_value):
- return
- if self._reuse_scratch_register:
- self._scratch_register_known = True
- self._scratch_register_value = value
+ if self._scratch_register_value != 0:
+ if self._scratch_register_value == value:
+ print '_load_scratch(%x) [REUSED]' % (value,)
+ return
+ offset = r_uint(value) - r_uint(self._scratch_register_value)
+ offset = intmask(offset)
+ if rx86.fits_in_32bits(offset):
+ print '_load_scratch(%x) [LEA r11+%d]' % (value, offset)
+ global COUNT_
+ try:
+ COUNT_ += 1
+ except NameError:
+ COUNT_ = 1
+ if COUNT_ % 182 == 0:
+ import pdb;pdb.set_trace()
+ self.LEA_rm(X86_64_SCRATCH_REG.value,
+ (X86_64_SCRATCH_REG.value, offset))
+ self._scratch_register_value = value
+ return
+ print '_load_scratch(%x) [too far]' % (value,)
+ else:
+ print '_load_scratch(%x) [new]' % (value,)
+ self._scratch_register_value = value
self.MOV_ri(X86_64_SCRATCH_REG.value, value)
+ def forget_scratch_register(self):
+ self._scratch_register_value = 0
+
def trap(self):
self.INT3()
- def begin_reuse_scratch_register(self):
- # --NEVER CALLED (only from a specific test)--
- # Flag the beginning of a block where it is okay to reuse the value
- # of the scratch register. In theory we shouldn't have to do this if
- # we were careful to mark all possible targets of a jump or call, and
- # "forget" the value of the scratch register at those positions, but
- # for now this seems safer.
- self._reuse_scratch_register = True
-
- def end_reuse_scratch_register(self):
- self._reuse_scratch_register = False
- self._scratch_register_known = False
-
def _vector_size_choose(name):
def invoke(self, suffix, val1, val2):
methname = name + suffix
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit