Author: Armin Rigo <ar...@tunes.org>
Branch: ppc-updated-backend
Changeset: r79093:d289f166a3dc
Date: 2015-08-20 08:35 -0700
http://bitbucket.org/pypy/pypy/changeset/d289f166a3dc/
Log: start by copying some more logic from x86, and some comments that are more recent than the ppc backend diff --git a/rpython/jit/backend/ppc/arch.py b/rpython/jit/backend/ppc/arch.py --- a/rpython/jit/backend/ppc/arch.py +++ b/rpython/jit/backend/ppc/arch.py @@ -1,50 +1,75 @@ # Constants that depend on whether we are on 32-bit or 64-bit -from rpython.jit.backend.ppc.register import (NONVOLATILES, - NONVOLATILES_FLOAT, - MANAGED_REGS, - MANAGED_FP_REGS) +from rpython.jit.backend.ppc import register as r import sys if sys.maxint == (2**31 - 1): + assert False, "the ppc backend only supports PPC-64 for now" WORD = 4 - DWORD = 2 * WORD + #DWORD = 2 * WORD IS_PPC_32 = True - BACKCHAIN_SIZE = 2 - FPR_SAVE_AREA = len(NONVOLATILES_FLOAT) * DWORD + #BACKCHAIN_SIZE = 2 + #FPR_SAVE_AREA = len(NONVOLATILES_FLOAT) * DWORD else: WORD = 8 - DWORD = 2 * WORD + #DWORD = 2 * WORD IS_PPC_32 = False - BACKCHAIN_SIZE = 6 - FPR_SAVE_AREA = len(NONVOLATILES_FLOAT) * WORD + #BACKCHAIN_SIZE = 6 + #FPR_SAVE_AREA = len(NONVOLATILES_FLOAT) * WORD IS_PPC_64 = not IS_PPC_32 MY_COPY_OF_REGS = 0 -FORCE_INDEX = WORD -GPR_SAVE_AREA = len(NONVOLATILES) * WORD -FLOAT_INT_CONVERSION = WORD +#FORCE_INDEX = WORD +#GPR_SAVE_AREA = len(NONVOLATILES) * WORD +#FLOAT_INT_CONVERSION = WORD MAX_REG_PARAMS = 8 MAX_FREG_PARAMS = 13 # we need at most 5 instructions to load a constant # and one instruction to patch the stack pointer -SIZE_LOAD_IMM_PATCH_SP = 6 +#SIZE_LOAD_IMM_PATCH_SP = 6 -FORCE_INDEX_OFS = (len(MANAGED_REGS) + len(MANAGED_FP_REGS)) * WORD +#FORCE_INDEX_OFS = (len(MANAGED_REGS) + len(MANAGED_FP_REGS)) * WORD -# The JITFRAME_FIXED_SIZE is measured in words, not bytes or bits. 
-# Follwing the PPC ABI, we are saving: -# - volatile fpr's -# - volatile gpr's -# - vrsave word -# - alignment padding -# - vector register save area (quadword aligned) -# 3 + 27 + 1 + 4 + 1 -JITFRAME_FIXED_SIZE = len(MANAGED_FP_REGS) + len(MANAGED_REGS) + 1 + 4 + 1 + +# +# +--------------------+ <- SP + STD_FRAME_SIZE +# | general registers | +# | save area | +# +--------------------+ <- SP + 120 +# | Local vars (*) | +# +--------------------+ <- SP + 112 +# | Parameter save | +# | area (8 args max) | +# +--------------------+ <- SP + 48 +# | TOC (unused) | +# +--------------------+ <- SP + 40 +# | link ed. (unused) | +# +--------------------+ <- SP + 32 +# | compiler (unused) | +# +--------------------+ <- SP + 24 +# | LR save area | +# +--------------------+ <- SP + 16 +# | CR save (unused) | +# +--------------------+ <- SP + 8 +# | SP back chain | +# +--------------------+ <- SP + +# The local variables area contains only a copy of the 2nd argument +# passed to the machine code function, which is the ll_threadlocal_addr. +# The 1st argument, i.e. the GC-managed jitframe, is stored in the +# register r31. + + +REGISTERS_SAVED = [r.r25, r.r26, r.r27, r.r28, r.r29, r.r30, r.r31] +assert REGISTERS_SAVED == [_r for _r in r.NONVOLATILES + if _r in r.MANAGED_REGS or _r == r.r31] + +STD_FRAME_SIZE_IN_BYTES = 120 + len(REGISTERS_SAVED) * WORD +assert STD_FRAME_SIZE_IN_BYTES % 16 == 0 # offset to LR in BACKCHAIN if IS_PPC_32: LR_BC_OFFSET = WORD else: - LR_BC_OFFSET = 2 * WORD + LR_BC_OFFSET = 16 diff --git a/rpython/jit/backend/ppc/assembler.py b/rpython/jit/backend/ppc/assembler.py --- a/rpython/jit/backend/ppc/assembler.py +++ b/rpython/jit/backend/ppc/assembler.py @@ -1,6 +1,6 @@ import os from rpython.jit.backend.ppc import form -from rpython.jit.backend.ppc import asmfunc +#from rpython.jit.backend.ppc import asmfunc # don't be fooled by the fact that there's some separation between a # generic assembler class and a PPC assembler class... 
there's diff --git a/rpython/jit/backend/ppc/codebuf.py b/rpython/jit/backend/ppc/codebuf.py deleted file mode 100644 --- a/rpython/jit/backend/ppc/codebuf.py +++ /dev/null @@ -1,5 +0,0 @@ -from rpython.jit.backend.llsupport.asmmemmgr import BlockBuilderMixin - -class MachineCodeBlockWrapper(BlockBuilderMixin): - def __init__(self): - self.init_block_builder() diff --git a/rpython/jit/backend/ppc/codebuilder.py b/rpython/jit/backend/ppc/codebuilder.py --- a/rpython/jit/backend/ppc/codebuilder.py +++ b/rpython/jit/backend/ppc/codebuilder.py @@ -1073,14 +1073,14 @@ if IS_PPC_64: self.load(r.TOC.value, r.SP.value, 5 * WORD) - def make_function_prologue(self, frame_size): - """ Build a new stackframe of size frame_size - and store the LR in the previous frame. - """ - with scratch_reg(self): - self.store_update(r.SP.value, r.SP.value, -frame_size) - self.mflr(r.SCRATCH.value) - self.store(r.SCRATCH.value, r.SP.value, frame_size + LR_BC_OFFSET) + ## def make_function_prologue(self, frame_size): + ## """ Build a new stackframe of size frame_size + ## and store the LR in the previous frame. + ## """ + ## with scratch_reg(self): + ## self.store_update(r.SP.value, r.SP.value, -frame_size) + ## self.mflr(r.SCRATCH.value) + ## self.store(r.SCRATCH.value, r.SP.value, frame_size + LR_BC_OFFSET) def restore_LR_from_caller_frame(self, frame_size): """ Restore the LR from the calling frame. 
diff --git a/rpython/jit/backend/ppc/helper/assembler.py b/rpython/jit/backend/ppc/helper/assembler.py --- a/rpython/jit/backend/ppc/helper/assembler.py +++ b/rpython/jit/backend/ppc/helper/assembler.py @@ -1,7 +1,6 @@ import rpython.jit.backend.ppc.condition as c from rpython.rlib.rarithmetic import intmask -from rpython.jit.backend.ppc.arch import (MAX_REG_PARAMS, IS_PPC_32, WORD, - BACKCHAIN_SIZE) +from rpython.jit.backend.ppc.arch import MAX_REG_PARAMS, IS_PPC_32, WORD from rpython.jit.metainterp.history import FLOAT import rpython.jit.backend.ppc.register as r from rpython.rtyper.lltypesystem import rffi, lltype diff --git a/rpython/jit/backend/ppc/opassembler.py b/rpython/jit/backend/ppc/opassembler.py --- a/rpython/jit/backend/ppc/opassembler.py +++ b/rpython/jit/backend/ppc/opassembler.py @@ -5,9 +5,8 @@ import rpython.jit.backend.ppc.register as r from rpython.jit.backend.ppc.locations import imm from rpython.jit.backend.ppc.locations import imm as make_imm_loc -from rpython.jit.backend.ppc.arch import (IS_PPC_32, WORD, BACKCHAIN_SIZE, - MAX_REG_PARAMS, MAX_FREG_PARAMS, - FORCE_INDEX_OFS, JITFRAME_FIXED_SIZE) +from rpython.jit.backend.ppc.arch import (IS_PPC_32, WORD, + MAX_REG_PARAMS, MAX_FREG_PARAMS) from rpython.jit.metainterp.history import (JitCellToken, TargetToken, Box, AbstractFailDescr, FLOAT, INT, REF) diff --git a/rpython/jit/backend/ppc/ppc_assembler.py b/rpython/jit/backend/ppc/ppc_assembler.py --- a/rpython/jit/backend/ppc/ppc_assembler.py +++ b/rpython/jit/backend/ppc/ppc_assembler.py @@ -4,13 +4,8 @@ from rpython.jit.backend.ppc.codebuilder import (PPCBuilder, OverwritingBuilder, scratch_reg) from rpython.jit.backend.ppc.arch import (IS_PPC_32, IS_PPC_64, WORD, - NONVOLATILES, MAX_REG_PARAMS, - GPR_SAVE_AREA, BACKCHAIN_SIZE, - FPR_SAVE_AREA, NONVOLATILES_FLOAT, - FLOAT_INT_CONVERSION, FORCE_INDEX, - SIZE_LOAD_IMM_PATCH_SP, - FORCE_INDEX_OFS, LR_BC_OFFSET, - JITFRAME_FIXED_SIZE) + LR_BC_OFFSET, + STD_FRAME_SIZE_IN_BYTES) from 
rpython.jit.backend.ppc.helper.assembler import Saved_Volatiles from rpython.jit.backend.ppc.helper.regalloc import _check_imm_arg import rpython.jit.backend.ppc.register as r @@ -73,17 +68,17 @@ class AssemblerPPC(OpAssembler, BaseAssembler): - ENCODING_AREA = FORCE_INDEX_OFS - OFFSET_SPP_TO_GPR_SAVE_AREA = (FORCE_INDEX + FLOAT_INT_CONVERSION - + ENCODING_AREA) - OFFSET_SPP_TO_FPR_SAVE_AREA = (OFFSET_SPP_TO_GPR_SAVE_AREA - + GPR_SAVE_AREA) - OFFSET_SPP_TO_OLD_BACKCHAIN = (OFFSET_SPP_TO_GPR_SAVE_AREA - + GPR_SAVE_AREA + FPR_SAVE_AREA) + #ENCODING_AREA = FORCE_INDEX_OFS + #OFFSET_SPP_TO_GPR_SAVE_AREA = (FORCE_INDEX + FLOAT_INT_CONVERSION + # + ENCODING_AREA) + #OFFSET_SPP_TO_FPR_SAVE_AREA = (OFFSET_SPP_TO_GPR_SAVE_AREA + # + GPR_SAVE_AREA) + #OFFSET_SPP_TO_OLD_BACKCHAIN = (OFFSET_SPP_TO_GPR_SAVE_AREA + # + GPR_SAVE_AREA + FPR_SAVE_AREA) - OFFSET_STACK_ARGS = OFFSET_SPP_TO_OLD_BACKCHAIN + BACKCHAIN_SIZE * WORD - if IS_PPC_64: - OFFSET_STACK_ARGS += MAX_REG_PARAMS * WORD + #OFFSET_STACK_ARGS = OFFSET_SPP_TO_OLD_BACKCHAIN + BACKCHAIN_SIZE * WORD + #if IS_PPC_64: + # OFFSET_STACK_ARGS += MAX_REG_PARAMS * WORD def __init__(self, cpu, translate_support_code=False): self.cpu = cpu @@ -676,15 +671,46 @@ fpreg = r.MANAGED_FP_REGS[i] mc.stfd(fpreg.value, r.SPP.value, (i + FLOAT_OFFSET) * WORD) - def gen_bootstrap_code(self, loophead, spilling_area): - self._insert_stack_check() - self._make_frame(spilling_area) - self.mc.b_offset(loophead) + #def gen_bootstrap_code(self, loophead, spilling_area): + # self._insert_stack_check() + # self._make_frame(spilling_area) + # self.mc.b_offset(loophead) - def _insert_stack_check(self): + def _call_header(self): + # Build a new stackframe of size STD_FRAME_SIZE_IN_BYTES + self.mc.store_update(r.SP.value, r.SP.value, -STD_FRAME_SIZE_IN_BYTES) + self.mc.mflr(r.SCRATCH.value) + self.mc.store(r.SCRATCH.value, r.SP.value, + STD_FRAME_SIZE_IN_BYTES + LR_BC_OFFSET) + + XXXX + # save SPP at the bottom of the stack frame + 
self.mc.store(r.SPP.value, r.SP.value, WORD) + + # compute spilling pointer (SPP) + self.mc.addi(r.SPP.value, r.SP.value, + frame_depth - self.OFFSET_SPP_TO_OLD_BACKCHAIN) + + # save nonvolatile registers + self._save_nonvolatiles() + + # save r31, use r30 as scratch register + # this is safe because r30 has been saved already + assert NONVOLATILES[-1] == r.SPP + ofs_to_r31 = (self.OFFSET_SPP_TO_GPR_SAVE_AREA + + WORD * (len(NONVOLATILES)-1)) + self.mc.load(r.r30.value, r.SP.value, WORD) + self.mc.store(r.r30.value, r.SPP.value, ofs_to_r31) + gcrootmap = self.cpu.gc_ll_descr.gcrootmap + if gcrootmap and gcrootmap.is_shadow_stack: + self.gen_shadowstack_header(gcrootmap) + + def _call_header_with_stack_check(self): + self._call_header() if self.stack_check_slowpath == 0: pass # not translated else: + XXXX # this is the size for the miniframe frame_size = BACKCHAIN_SIZE * WORD @@ -733,24 +759,23 @@ pmc.bc(4, 1, offset) # jump if SCRATCH <= r16, i. e. not(SCRATCH > r16) pmc.overwrite() - def setup(self, looptoken, operations): - self.current_clt = looptoken.compiled_loop_token - operations = self.cpu.gc_ll_descr.rewrite_assembler(self.cpu, - operations, self.current_clt.allgcrefs) - assert self.memcpy_addr != 0 + def setup(self, looptoken): + BaseAssembler.setup(self, looptoken) + assert self.memcpy_addr != 0, "setup_once() not called?" + self.current_clt = looptoken.compiled_loop_token + self.pending_guard_tokens = [] + #if WORD == 8: + # self.pending_memoryerror_trampoline_from = [] + # self.error_trampoline_64 = 0 self.mc = PPCBuilder() - self.pending_guards = [] + #assert self.datablockwrapper is None --- but obscure case + # possible, e.g. 
getting MemoryError and continuing allblocks = self.get_asmmemmgr_blocks(looptoken) self.datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr, allblocks) - self.max_stack_params = 0 self.target_tokens_currently_compiling = {} self.frame_depth_to_patch = [] - self._finish_gcmap = lltype.nullptr(jitframe.GCMAP) - return operations # do we really need this return? - - def setup_once(self): - BaseAssembler.setup_once(self) + #self.max_stack_params = 0 def _append_debugging_code(self, operations, tp, number, token): counter = self._register_counter(tp, number, token) @@ -788,77 +813,74 @@ baseofs = self.cpu.get_baseofs_of_frame_field() self.current_clt.frame_info.update_frame_depth(baseofs, frame_depth) - def assemble_loop(self, loopname, inputargs, operations, looptoken, log): + @rgc.no_release_gil + def assemble_loop(self, jd_id, unique_id, logger, loopname, inputargs, + operations, looptoken, log): clt = CompiledLoopToken(self.cpu, looptoken.number) - clt.allgcrefs = [] looptoken.compiled_loop_token = clt clt._debug_nbargs = len(inputargs) - if not we_are_translated(): + # Arguments should be unique assert len(set(inputargs)) == len(inputargs) - operations = self.setup(looptoken, operations) + self.setup(looptoken) + frame_info = self.datablockwrapper.malloc_aligned( + jitframe.JITFRAMEINFO_SIZE, alignment=WORD) + clt.frame_info = rffi.cast(jitframe.JITFRAMEINFOPTR, frame_info) + clt.allgcrefs = [] + clt.frame_info.clear() # for now if log: operations = self._inject_debugging_code(looptoken, operations, 'e', looptoken.number) - self.startpos = self.mc.currpos() regalloc = Regalloc(assembler=self) + # + self._call_header_with_stack_check() + operations = regalloc.prepare_loop(inputargs, operations, + looptoken, clt.allgcrefs) + looppos = self.mc.get_relative_pos() + frame_depth_no_fixed_size = self._assemble(regalloc, inputargs, + operations) + self.update_frame_depth(frame_depth_no_fixed_size + JITFRAME_FIXED_SIZE) + # + size_excluding_failure_stuff = 
self.mc.get_relative_pos() + self.write_pending_failure_recoveries() + full_size = self.mc.get_relative_pos() + # + rawstart = self.materialize_loop(looptoken) + self.patch_stack_checks(frame_depth_no_fixed_size + JITFRAME_FIXED_SIZE, + rawstart) + looptoken._ll_loop_code = looppos + rawstart + debug_start("jit-backend-addr") + debug_print("Loop %d (%s) has address 0x%x to 0x%x (bootstrap 0x%x)" % ( + looptoken.number, loopname, + r_uint(rawstart + looppos), + r_uint(rawstart + size_excluding_failure_stuff), + r_uint(rawstart))) + debug_stop("jit-backend-addr") + self.patch_pending_failure_recoveries(rawstart) + # + ops_offset = self.mc.ops_offset + if not we_are_translated(): + # used only by looptoken.dump() -- useful in tests + looptoken._x86_rawstart = rawstart + looptoken._x86_fullsize = full_size + looptoken._x86_ops_offset = ops_offset + looptoken._ll_function_addr = rawstart + if logger: + logger.log_loop(inputargs, operations, 0, "rewritten", + name=loopname, ops_offset=ops_offset) - regalloc.prepare_loop(inputargs, operations, looptoken) - - start_pos = self.mc.currpos() - looptoken._ppc_loop_code = start_pos - clt.frame_depth = clt.param_depth = -1 - spilling_area, param_depth = self._assemble(operations, regalloc) - size_excluding_failure_stuff = self.mc.currpos() - clt.frame_depth = spilling_area - clt.param_depth = param_depth - frame_info = self.datablockwrapper.malloc_aligned(jitframe.JITFRAMEINFO_SIZE, - alignment=WORD) - clt.frame_info = rffi.cast(jitframe.JITFRAMEINFOPTR, frame_info) - - direct_bootstrap_code = self.mc.currpos() - frame_depth = self.compute_frame_depth(spilling_area, param_depth) - frame_depth += JITFRAME_FIXED_SIZE - self.update_frame_depth(frame_depth) - self.gen_bootstrap_code(start_pos, frame_depth) - - self.write_pending_failure_recoveries() - if IS_PPC_64: - fdescr = self.gen_64_bit_func_descr() - - # write instructions to memory - loop_start = self.materialize_loop(looptoken, False) - self.fixup_target_tokens(loop_start) - 
- real_start = loop_start + direct_bootstrap_code - if IS_PPC_32: - looptoken._ppc_func_addr = looptoken._ll_function_addr = real_start - else: - self.write_64_bit_func_descr(fdescr, real_start) - looptoken._ppc_func_addr = looptoken._ll_function_addr = fdescr - - self.process_pending_guards(loop_start) - - if log and not we_are_translated(): - self.mc._dump_trace(real_start, - 'loop.asm') - - ops_offset = self.mc.ops_offset + self.fixup_target_tokens(rawstart) self.teardown() - - debug_start("jit-backend-addr") - debug_print("Loop %d (%s) has address %x to %x (bootstrap %x)" % ( - looptoken.number, loopname, - real_start, - real_start + size_excluding_failure_stuff, - loop_start)) - debug_stop("jit-backend-addr") - - return AsmInfo(ops_offset, loop_start, - size_excluding_failure_stuff - start_pos) + # oprofile support + if self.cpu.profile_agent is not None: + name = "Loop # %s: %s" % (looptoken.number, loopname) + self.cpu.profile_agent.native_code_written(name, + rawstart, full_size) + return AsmInfo(ops_offset, rawstart + looppos, + size_excluding_failure_stuff - looppos) def _assemble(self, operations, regalloc): regalloc.compute_hint_frame_locations(operations) diff --git a/rpython/jit/backend/ppc/register.py b/rpython/jit/backend/ppc/register.py --- a/rpython/jit/backend/ppc/register.py +++ b/rpython/jit/backend/ppc/register.py @@ -17,8 +17,10 @@ VOLATILES = [r0, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12] # volatile r2 is persisted around calls and r13 can be ignored -NONVOLATILES_FLOAT = [f14, f15, f16, f17, f18, f19, f20, f21, f22, f23, - f24, f25, f26, f27, f28, f29, f30, f31] +# we don't use any non-volatile float register, to keep the frame header +# code short-ish +#NONVOLATILES_FLOAT = [f14, f15, f16, f17, f18, f19, f20, f21, f22, f23, +# f24, f25, f26, f27, f28, f29, f30, f31] VOLATILES_FLOAT = [f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13] SCRATCH = r0 @@ -28,12 +30,12 @@ RES = r3 SPP = r31 -MANAGED_REGS = [r3, r4, r5, r6, r7, r8, r9, 
r10, - r11, r12, r14, r15, r16, r17, r18, - r19, r20, r21, r22, r23, r24, r25, r26, - r27, r28, r29, r30] +MANAGED_REGS = [r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, + r25, r26, r27, r28, r29, r30] + # registers r14 to r24 are not touched, we have enough + # registers already -MANAGED_FP_REGS = VOLATILES_FLOAT[1:] + NONVOLATILES_FLOAT +MANAGED_FP_REGS = VOLATILES_FLOAT[1:] #+ NONVOLATILES_FLOAT PARAM_REGS = [r3, r4, r5, r6, r7, r8, r9, r10] PARAM_FPREGS = [f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13] diff --git a/rpython/jit/backend/ppc/runner.py b/rpython/jit/backend/ppc/runner.py --- a/rpython/jit/backend/ppc/runner.py +++ b/rpython/jit/backend/ppc/runner.py @@ -1,7 +1,7 @@ import py from rpython.rtyper.lltypesystem import lltype, llmemory, rffi from rpython.rtyper.llinterp import LLInterpreter -from rpython.jit.backend.ppc.arch import FORCE_INDEX_OFS +#from rpython.jit.backend.ppc.arch import FORCE_INDEX_OFS from rpython.jit.backend.llsupport.llmodel import AbstractLLCPU from rpython.jit.backend.ppc.ppc_assembler import AssemblerPPC from rpython.jit.backend.ppc.arch import WORD @@ -43,10 +43,6 @@ def finish_once(self): self.assembler.finish_once() - def compile_loop(self, inputargs, operations, looptoken, log=True, name=""): - return self.assembler.assemble_loop(name, inputargs, - operations, looptoken, log) - def compile_bridge(self, faildescr, inputargs, operations, original_loop_token, log=False): clt = original_loop_token.compiled_loop_token @@ -54,12 +50,6 @@ return self.assembler.assemble_bridge(faildescr, inputargs, operations, original_loop_token, log=log) - def clear_latest_values(self, count): - setitem = self.assembler.fail_boxes_ptr.setitem - null = lltype.nullptr(llmemory.GCREF.TO) - for index in range(count): - setitem(index, null) - @staticmethod def cast_ptr_to_int(x): adr = llmemory.cast_ptr_to_adr(x) _______________________________________________ pypy-commit mailing list pypy-commit@python.org 
https://mail.python.org/mailman/listinfo/pypy-commit