Author: David Schneider <[email protected]>
Branch: arm-backed-float
Changeset: r44285:2cccfc701168
Date: 2011-05-18 16:48 +0200
http://bitbucket.org/pypy/pypy/changeset/2cccfc701168/

Log:    (bivab, arigo) start implementing shadowstack support on arm
        backend. One test passes, yay

diff --git a/pypy/jit/backend/arm/arch.py b/pypy/jit/backend/arm/arch.py
--- a/pypy/jit/backend/arm/arch.py
+++ b/pypy/jit/backend/arm/arch.py
@@ -2,8 +2,14 @@
 from pypy.rlib.rarithmetic import r_uint
 from pypy.rpython.lltypesystem import lltype
 
+
 FUNC_ALIGN=8
 WORD=4
+
+# the number of registers that we need to save around malloc calls
+N_REGISTERS_SAVED_BY_MALLOC = 9
+# the offset from the FP where the list of the registers mentioned above starts
+MY_COPY_OF_REGS = WORD
 # The Address in the PC points two words behind the current instruction
 PC_OFFSET = 8
 
diff --git a/pypy/jit/backend/arm/assembler.py b/pypy/jit/backend/arm/assembler.py
--- a/pypy/jit/backend/arm/assembler.py
+++ b/pypy/jit/backend/arm/assembler.py
@@ -2,7 +2,7 @@
 from pypy.jit.backend.arm import conditions as c
 from pypy.jit.backend.arm import locations
 from pypy.jit.backend.arm import registers as r
-from pypy.jit.backend.arm.arch import WORD, FUNC_ALIGN, PC_OFFSET
+from pypy.jit.backend.arm.arch import WORD, FUNC_ALIGN, PC_OFFSET, N_REGISTERS_SAVED_BY_MALLOC
 from pypy.jit.backend.arm.codebuilder import ARMv7Builder, OverwritingBuilder
 from pypy.jit.backend.arm.regalloc import (Regalloc, ARMFrameManager,
                                                     _check_imm_arg, TempInt, TempPtr)
@@ -71,14 +71,28 @@
         self.malloc_str_func_addr = 0
         self.malloc_unicode_func_addr = 0
         self.memcpy_addr = 0
-        self.teardown()
+        self.guard_descrs = None
         self._exit_code_addr = 0
+        self.current_clt = None
+        self.malloc_slowpath = 0
+        self._regalloc = None
+        self.datablockwrapper = None
 
-    def setup(self):
+    def setup(self, looptoken):
         assert self.memcpy_addr != 0, 'setup_once() not called?'
+        self.current_clt = looptoken.compiled_loop_token
         self.mc = ARMv7Builder()
         self.guard_descrs = []
         self.blocks = []
+        self.datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr,
+                                                            self.blocks)
+
+    def teardown(self):
+        self.current_clt = None
+        self._regalloc = None
+        self.datablockwrapper.done()
+        self.mc = None
+        self.guard_descrs = None
 
     def setup_once(self):
         # Addresses of functions called by new_xxx operations
@@ -241,6 +255,31 @@
         mem[i+2] = chr((n >> 16) & 0xFF)
         mem[i+3] = chr((n >> 24) & 0xFF)
 
+    def _build_malloc_slowpath(self):
+        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
+        mc = ARMv7Builder()
+        assert gcrootmap is not None and gcrootmap.is_shadow_stack
+        # ---- shadowstack ----
+        for reg, ofs in gpr_reg_mgr_cls.REGLOC_TO_COPY_AREA_OFS.items():
+            mc.MOV_br(ofs, reg.value)
+        mc.SUB_ri(esp.value, 16 - WORD)      # stack alignment of 16 bytes
+        if IS_X86_32:
+            mc.MOV_sr(0, edx.value)          # push argument
+        elif IS_X86_64:
+            mc.MOV_rr(edi.value, edx.value)
+        mc.CALL(imm(addr))
+        mc.ADD_ri(esp.value, 16 - WORD)
+        for reg, ofs in gpr_reg_mgr_cls.REGLOC_TO_COPY_AREA_OFS.items():
+            mc.MOV_rb(reg.value, ofs)
+        if self.cpu.supports_floats:          # restore the XMM registers
+            for i in range(self.cpu.NUM_REGS):# from where they were saved
+                mc.MOVSD_xs(i, (WORD*2)+8*i)
+        nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
+        mc.MOV(edx, heap(nursery_free_adr))   # load this in EDX
+        mc.RET()
+        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
+        self.malloc_slowpath2 = rawstart
+
     def _gen_leave_jitted_hook_code(self, save_exc=False):
         mc = ARMv7Builder()
         # XXX add a check if cpu supports floats 
@@ -282,10 +321,7 @@
         # 1 separator byte
         # 4 bytes for the faildescr
         memsize = (len(arglocs)-1)*6+5
-        datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr,
-                                                    self.blocks)
-        memaddr = datablockwrapper.malloc_aligned(memsize, alignment=WORD)
-        datablockwrapper.done()
+        memaddr = self.datablockwrapper.malloc_aligned(memsize, alignment=WORD)
         mem = rffi.cast(rffi.CArrayPtr(lltype.Char), memaddr)
         i = 0
         j = 0
@@ -344,16 +380,19 @@
         if mc is None:
             mc = self.mc
         mc.MOV_rr(r.sp.value, r.fp.value, cond=cond)
-        mc.ADD_ri(r.sp.value, r.sp.value, WORD, cond=cond)
+        mc.ADD_ri(r.sp.value, r.sp.value, (N_REGISTERS_SAVED_BY_MALLOC+1)*WORD, cond=cond)
         if self.cpu.supports_floats:
-            mc.VPOP([reg.value for reg in r.callee_saved_vfp_registers])
+            mc.VPOP([reg.value for reg in r.callee_saved_vfp_registers], cond=cond)
         mc.POP([reg.value for reg in r.callee_restored_registers], cond=cond)
 
     def gen_func_prolog(self):
         self.mc.PUSH([reg.value for reg in r.callee_saved_registers])
         if self.cpu.supports_floats:
             self.mc.VPUSH([reg.value for reg in r.callee_saved_vfp_registers])
-        self.mc.SUB_ri(r.sp.value, r.sp.value,  WORD)
+        # here we modify the stack pointer to leave room for the 9 registers
+        # that are going to be saved here around malloc calls and one word to
+        # store the force index
+        self.mc.SUB_ri(r.sp.value, r.sp.value,  (N_REGISTERS_SAVED_BY_MALLOC+1)*WORD)
         self.mc.MOV_rr(r.fp.value, r.sp.value)
 
     def gen_bootstrap_code(self, nonfloatlocs, floatlocs, inputargs):
@@ -468,12 +507,14 @@
     # cpu interface
     def assemble_loop(self, inputargs, operations, looptoken, log):
         self._dump(operations)
-        self.setup()
+
+        clt = CompiledLoopToken(self.cpu, looptoken.number)
+        looptoken.compiled_loop_token = clt
+
+        self.setup(looptoken)
         longevity = compute_vars_longevity(inputargs, operations)
         regalloc = Regalloc(longevity, assembler=self, frame_manager=ARMFrameManager())
 
-        clt = CompiledLoopToken(self.cpu, looptoken.number)
-        looptoken.compiled_loop_token = clt
 
         self.align()
         self.gen_func_prolog()
@@ -509,7 +550,7 @@
     def assemble_bridge(self, faildescr, inputargs, operations,
                                                     original_loop_token, log):
         self._dump(operations, 'bridge')
-        self.setup()
+        self.setup(original_loop_token)
         assert isinstance(faildescr, AbstractFailDescr)
         code = faildescr._failure_recovery_code
         enc = rffi.cast(rffi.CCHARP, code)
@@ -548,11 +589,6 @@
         for descr in self.guard_descrs:
             descr._arm_block_start = block_start
 
-    def teardown(self):
-        self.mc = None
-        self.guard_descrs = None
-        #self.looppos = -1
-        #self.currently_compiling_loop = None
 
     def get_asmmemmgr_blocks(self, looptoken):
         clt = looptoken.compiled_loop_token
@@ -600,7 +636,7 @@
 
     def _walk_operations(self, operations, regalloc):
         fcond=c.AL
-        self._insert_checks()
+        self._regalloc = regalloc
         while regalloc.position() < len(operations) - 1:
             regalloc.next_instruction()
             i = regalloc.position()
@@ -621,7 +657,6 @@
                 regalloc.possibly_free_var(op.result)
             regalloc.possibly_free_vars_for_op(op)
             regalloc._check_invariants()
-            self._insert_checks()
 
     def can_merge_with_next_guard(self, op, i, operations):
         num = op.getopnum()
@@ -791,6 +826,75 @@
     def leave_jitted_hook(self):
         pass
 
+    def malloc_cond(self, nursery_free_adr, nursery_top_adr, size, tid):
+        size = max(size, self.cpu.gc_ll_descr.minimal_size_in_nursery)
+        size = (size + WORD-1) & ~(WORD-1)     # round up
+
+        self.mc.gen_load_int(r.r0.value, nursery_free_adr)
+        self.mc.LDR_ri(r.r0.value, r.r0.value)
+
+        self.mc.ADD_ri(r.ip.value, r.r0.value, size)
+
+        # XXX maybe use an offset from the value nursery_free_addr
+        self.mc.gen_load_int(r.r1.value, nursery_top_adr)
+        self.mc.LDR_ri(r.r1.value, r.r1.value)
+
+        self.mc.CMP_rr(r.ip.value, r.r1.value)
+
+        fast_jmp_pos = self.mc.currpos()
+        self.mc.NOP()
+
+        # XXX update
+        # See comments in _build_malloc_slowpath for the
+        # details of the two helper functions that we are calling below.
+        # First, we need to call two of them and not just one because we
+        # need to have a mark_gc_roots() in between.  Then the calling
+        # convention of slowpath_addr{1,2} are tweaked a lot to allow
+        # the code here to be just two CALLs: slowpath_addr1 gets the
+        # size of the object to allocate from (EDX-EAX) and returns the
+        # result in EAX; slowpath_addr2 additionally returns in EDX a
+        # copy of heap(nursery_free_adr), so that the final MOV below is
+        # a no-op.
+
+        self.mark_gc_roots(self.write_new_force_index(),
+                           use_copy_area=True)
+        slowpath_addr2 = self.malloc_slowpath
+        self.mc.BL(slowpath_addr2)
+
+        offset = self.mc.currpos() - fast_jmp_pos
+        pmc = OverwritingBuilder(self.mc, fast_jmp_pos, WORD)
+        pmc.ADD_ri(r.pc.value, r.pc.value, offset - PC_OFFSET, cond=c.LS)
+
+        self.mc.gen_load_int(r.r1.value, nursery_free_adr)
+        self.mc.STR_ri(r.ip.value, r.r1.value)
+
+        self.mc.gen_load_int(r.ip.value, tid)
+        self.mc.STR_ri(r.ip.value, r.r0.value)
+
+
+    def mark_gc_roots(self, force_index, use_copy_area=False):
+        if force_index < 0:
+            return     # not needed
+        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
+        if gcrootmap:
+            mark = self._regalloc.get_mark_gc_roots(gcrootmap, use_copy_area)
+            assert gcrootmap.is_shadow_stack
+            gcrootmap.write_callshape(mark, force_index)
+
+    def write_new_force_index(self):
+        # for shadowstack only: get a new, unused force_index number and
+        # write it to FORCE_INDEX_OFS.  Used to record the call shape
+        # (i.e. where the GC pointers are in the stack) around a CALL
+        # instruction that doesn't already have a force_index.
+        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
+        if gcrootmap and gcrootmap.is_shadow_stack:
+            clt = self.current_clt
+            force_index = clt.reserve_and_record_some_faildescr_index()
+            self._write_fail_index(force_index)
+            return force_index
+        else:
+            return 0
+
 def make_operation_list():
     def notimplemented(self, op, arglocs, regalloc, fcond):
         raise NotImplementedError, op
diff --git a/pypy/jit/backend/arm/regalloc.py b/pypy/jit/backend/arm/regalloc.py
--- a/pypy/jit/backend/arm/regalloc.py
+++ b/pypy/jit/backend/arm/regalloc.py
@@ -10,12 +10,14 @@
                                                     prepare_float_op,
                                                     _check_imm_arg)
 from pypy.jit.backend.arm.jump import remap_frame_layout_mixed
+from pypy.jit.backend.arm.arch import MY_COPY_OF_REGS, WORD
 from pypy.jit.codewriter import longlong
 from pypy.jit.metainterp.history import (Const, ConstInt, ConstFloat, ConstPtr,
                                         Box, BoxInt, BoxPtr, AbstractFailDescr,
                                         INT, REF, FLOAT, LoopToken)
 from pypy.jit.metainterp.resoperation import rop
-from pypy.jit.backend.llsupport.descr import BaseFieldDescr, BaseArrayDescr
+from pypy.jit.backend.llsupport.descr import BaseFieldDescr, BaseArrayDescr, \
+                                             BaseCallDescr, BaseSizeDescr
 from pypy.jit.backend.llsupport import symbolic
 from pypy.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
 from pypy.rpython.lltypesystem import lltype, rffi, rstr, llmemory
@@ -90,6 +92,18 @@
     no_lower_byte_regs    = all_regs
     save_around_call_regs = r.caller_resp
 
+    REGLOC_TO_COPY_AREA_OFS = {
+        r.r2: MY_COPY_OF_REGS + 0 * WORD,
+        r.r3: MY_COPY_OF_REGS + 1 * WORD,
+        r.r4: MY_COPY_OF_REGS + 2 * WORD,
+        r.r5: MY_COPY_OF_REGS + 3 * WORD,
+        r.r6: MY_COPY_OF_REGS + 4 * WORD,
+        r.r7: MY_COPY_OF_REGS + 5 * WORD,
+        r.r8: MY_COPY_OF_REGS + 6 * WORD,
+        r.r9: MY_COPY_OF_REGS + 7 * WORD,
+        r.r10: MY_COPY_OF_REGS + 8 * WORD,
+    }
+
     def __init__(self, longevity, frame_manager=None, assembler=None):
         RegisterManager.__init__(self, longevity, frame_manager, assembler)
 
@@ -801,11 +815,15 @@
         return [argloc, resloc]
 
     def prepare_op_new(self, op, fcond):
-        arglocs = self._prepare_args_for_new_op(op.getdescr())
-        self.assembler._emit_call(self.assembler.malloc_func_addr,
-                                arglocs, self, result=op.result)
-        self.possibly_free_vars(arglocs)
-        self.possibly_free_var(op.result)
+        gc_ll_descr = self.assembler.cpu.gc_ll_descr
+        if gc_ll_descr.can_inline_malloc(op.getdescr()):
+            self.fastpath_malloc_fixedsize(op, op.getdescr())
+        else:
+            arglocs = self._prepare_args_for_new_op(op.getdescr())
+            self.assembler._emit_call(self.assembler.malloc_func_addr,
+                                    arglocs, self, result=op.result)
+            self.possibly_free_vars(arglocs)
+            self.possibly_free_var(op.result)
         return []
 
     def prepare_op_new_with_vtable(self, op, fcond):
@@ -821,12 +839,69 @@
     def prepare_op_new_array(self, op, fcond):
         gc_ll_descr = self.cpu.gc_ll_descr
         if gc_ll_descr.get_funcptr_for_newarray is not None:
-            raise NotImplementedError
+            # framework GC
+            box_num_elem = op.getarg(0)
+            if isinstance(box_num_elem, ConstInt):
+                num_elem = box_num_elem.value
+                if gc_ll_descr.can_inline_malloc_varsize(op.getdescr(),
+                                                         num_elem):
+                    self.fastpath_malloc_varsize(op, op.getdescr(), num_elem)
+                    return
+            args = self.assembler.cpu.gc_ll_descr.args_for_new_array(
+                op.getdescr())
+            arglocs = [imm(x) for x in args]
+            arglocs.append(self.loc(box_num_elem))
+            self._call(op, arglocs)
         # boehm GC
         itemsize, scale, basesize, ofs_length, _ = (
             self._unpack_arraydescr(op.getdescr()))
         return self._malloc_varsize(basesize, ofs_length, itemsize, op)
 
+    def fastpath_malloc_varsize(self, op, arraydescr, num_elem):
+        assert isinstance(arraydescr, BaseArrayDescr)
+        ofs_length = arraydescr.get_ofs_length(self.cpu.translate_support_code)
+        basesize = arraydescr.get_base_size(self.cpu.translate_support_code)
+        itemsize = arraydescr.get_item_size(self.cpu.translate_support_code)
+        size = basesize + itemsize * num_elem
+        self._do_fastpath_malloc(op, size, arraydescr.tid)
+        self.assembler.set_new_array_length(eax, ofs_length, imm(num_elem))
+
+    def fastpath_malloc_fixedsize(self, op, descr):
+        assert isinstance(descr, BaseSizeDescr)
+        self._do_fastpath_malloc(op, descr.size, descr.tid)
+
+    def _do_fastpath_malloc(self, op, size, tid):
+        gc_ll_descr = self.assembler.cpu.gc_ll_descr
+        self.rm.force_allocate_reg(op.result, selected_reg=r.r0)
+        t = TempInt()
+        self.rm.force_allocate_reg(t, selected_reg=r.r1)
+        self.possibly_free_var(op.result)
+        self.possibly_free_var(t)
+
+        self.assembler.malloc_cond(
+            gc_ll_descr.get_nursery_free_addr(),
+            gc_ll_descr.get_nursery_top_addr(),
+            size, tid,
+            )
+
+    def get_mark_gc_roots(self, gcrootmap, use_copy_area=False):
+        shape = gcrootmap.get_basic_shape(False)
+        for v, val in self.frame_manager.frame_bindings.items():
+            if (isinstance(v, BoxPtr) and self.rm.stays_alive(v)):
+                assert isinstance(val, StackLoc)
+                gcrootmap.add_frame_offset(shape, val.position)
+        for v, reg in self.rm.reg_bindings.items():
+            if reg is r.r0:
+                continue
+            if (isinstance(v, BoxPtr) and self.rm.stays_alive(v)):
+                if use_copy_area:
+                    assert reg in self.rm.REGLOC_TO_COPY_AREA_OFS
+                    area_offset = self.rm.REGLOC_TO_COPY_AREA_OFS[reg]
+                    gcrootmap.add_frame_offset(shape, area_offset)
+                else:
+                    assert 0, 'sure??'
+        return gcrootmap.compress_callshape(shape,
+                                            self.assembler.datablockwrapper)
     def prepare_op_newstr(self, op, fcond):
         gc_ll_descr = self.cpu.gc_ll_descr
         if gc_ll_descr.get_funcptr_for_newstr is not None:
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to