Author: Richard Plangger <planri...@gmail.com> Branch: ppc-vsx-support Changeset: r85962:7673f44c3693 Date: 2016-08-01 19:04 +0200 http://bitbucket.org/pypy/pypy/changeset/7673f44c3693/
Log: impl. flush_vector_cc for x86 using PBLENDVB diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py --- a/rpython/jit/backend/x86/assembler.py +++ b/rpython/jit/backend/x86/assembler.py @@ -108,9 +108,20 @@ single_neg_const = '\x00\x00\x00\x80\x00\x00\x00\x80\x00\x00\x00\x80\x00\x00\x00\x80' zero_const = '\x00' * 16 # + two_64bit_ones = '\x01\x00\x00\x00\x00\x00\x00\x00' * 2 + four_32bit_ones = '\x01\x00\x00\x00' * 4 + eight_16bit_ones = '\x01\x00' * 8 + sixteen_8bit_ones = '\x01' * 16 + + + + + + # data = neg_const + abs_const + \ single_neg_const + single_abs_const + \ - zero_const + zero_const + sixteen_8bit_ones + eight_16bit_ones + \ + four_32bit_ones + two_64bit_ones datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr, []) float_constants = datablockwrapper.malloc_aligned(len(data), alignment=16) datablockwrapper.done() @@ -122,6 +133,7 @@ self.single_float_const_neg_addr = float_constants + 32 self.single_float_const_abs_addr = float_constants + 48 self.expand_byte_mask_addr = float_constants + 64 + self.element_ones = [float_constants + 80 + 16*i for i in range(4)] def set_extra_stack_depth(self, mc, value): if self._is_asmgcc(): diff --git a/rpython/jit/backend/x86/rx86.py b/rpython/jit/backend/x86/rx86.py --- a/rpython/jit/backend/x86/rx86.py +++ b/rpython/jit/backend/x86/rx86.py @@ -793,6 +793,7 @@ PTEST_xx = xmminsn('\x66', rex_nw, '\x0F\x38\x17', register(1,8), register(2), '\xC0') PBLENDW_xxi = xmminsn('\x66', rex_nw, '\x0F\x3A\x0E', register(1,8), register(2), '\xC0', immediate(3, 'b')) + PBLENDVB_xx = xmminsn('\x66', rex_nw, '\x0F\x38\x10', register(1,8), register(2), '\xC0') CMPPD_xxi = xmminsn('\x66', rex_nw, '\x0F\xC2', register(1,8), register(2), '\xC0', immediate(3, 'b')) CMPPS_xxi = xmminsn( rex_nw, '\x0F\xC2', register(1,8), register(2), '\xC0', immediate(3, 'b')) diff --git a/rpython/jit/backend/x86/vector_ext.py b/rpython/jit/backend/x86/vector_ext.py --- 
a/rpython/jit/backend/x86/vector_ext.py +++ b/rpython/jit/backend/x86/vector_ext.py @@ -10,7 +10,7 @@ xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, X86_64_SCRATCH_REG, X86_64_XMM_SCRATCH_REG, AddressLoc) from rpython.jit.backend.llsupport.vector_ext import VectorExt -from rpython.jit.backend.llsupport.regalloc import get_scale +from rpython.jit.backend.llsupport.regalloc import get_scale, TempVar from rpython.jit.metainterp.resoperation import (rop, ResOperation, VectorOp, VectorGuardOp) from rpython.rlib.objectmodel import we_are_translated, always_inline @@ -33,6 +33,14 @@ raise NotImplementedError(msg) # DUP END +class TempVector(TempVar): + def __init__(self, type): + self.type = type + def is_vector(self): + return True + def __repr__(self): + return "<TempVector At %s>" % (id(self),) + class X86VectorExt(VectorExt): def setup_once(self, asm): if detect_feature.detect_sse4_1(): @@ -292,29 +300,50 @@ self.mc.XORPD(src, heap(self.float_const_neg_addr)) def genop_vec_float_eq(self, op, arglocs, resloc): - _, rhsloc, sizeloc = arglocs + lhsloc, rhsloc, sizeloc = arglocs size = sizeloc.value if size == 4: - self.mc.CMPPS_xxi(resloc.value, rhsloc.value, 0) # 0 means equal + self.mc.CMPPS_xxi(lhsloc.value, rhsloc.value, 0) # 0 means equal else: - self.mc.CMPPD_xxi(resloc.value, rhsloc.value, 0) + self.mc.CMPPD_xxi(lhsloc.value, rhsloc.value, 0) + self.flush_vec_cc(rx86.Conditions["E"], lhsloc, resloc, sizeloc.value) + + def flush_vec_cc(self, rev_cond, lhsloc, resloc, size): + # After emitting an instruction that leaves a boolean result in + # a condition code (cc), call this. In the common case, result_loc + # will be set to SPP by the regalloc, which in this case means + # "propagate it between this operation and the next guard by keeping + # it in the cc". In the uncommon case, result_loc is another + # register, and we emit a load from the cc into this register. 
+ + if resloc is ebp: + self.guard_success_cc = rev_cond + else: + assert lhsloc is xmm0 + maskloc = X86_64_XMM_SCRATCH_REG + self.mc.MOVAPD(maskloc, heap(self.element_ones[get_scale(size)])) + self.mc.PXOR(resloc, resloc) + # note that xmm0 contains true false for each element by the last compare operation + self.mc.PBLENDVB_xx(resloc.value, maskloc.value) def genop_vec_float_ne(self, op, arglocs, resloc): - _, rhsloc, sizeloc = arglocs + lhsloc, rhsloc, sizeloc = arglocs size = sizeloc.value # b(100) == 1 << 2 means not equal if size == 4: - self.mc.CMPPS_xxi(resloc.value, rhsloc.value, 1 << 2) + self.mc.CMPPS_xxi(lhsloc.value, rhsloc.value, 1 << 2) else: - self.mc.CMPPD_xxi(resloc.value, rhsloc.value, 1 << 2) + self.mc.CMPPD_xxi(lhsloc.value, rhsloc.value, 1 << 2) + self.flush_vec_cc(rx86.Conditions["NE"], lhsloc, resloc, sizeloc.value) def genop_vec_int_eq(self, op, arglocs, resloc): - _, rhsloc, sizeloc = arglocs + lhsloc, rhsloc, sizeloc = arglocs size = sizeloc.value - self.mc.PCMPEQ(resloc, rhsloc, size) + self.mc.PCMPEQ(lhsloc, rhsloc, size) + self.flush_vec_cc(rx86.Conditions["E"], lhsloc, resloc, sizeloc.value) def genop_vec_int_ne(self, op, arglocs, resloc): - _, rhsloc, sizeloc = arglocs + lhsloc, rhsloc, sizeloc = arglocs size = sizeloc.value - self.mc.PCMPEQ(resloc, rhsloc, size) + self.mc.PCMPEQ(lhsloc, rhsloc, size) temp = X86_64_XMM_SCRATCH_REG @@ -325,6 +354,7 @@ # 11 11 11 11 # ----------- pxor # 00 11 00 00 + self.flush_vec_cc(rx86.Conditions["NE"], lhsloc, resloc, sizeloc.value) def genop_vec_int_signext(self, op, arglocs, resloc): srcloc, sizeloc, tosizeloc = arglocs @@ -599,9 +629,55 @@ lhs = op.getarg(0) assert isinstance(lhs, VectorOp) args = op.getarglist() + # we need to use xmm0 + lhsloc = self.enforce_var_in_vector_reg(op.getarg(0), args, selected_reg=xmm0) rhsloc = self.make_sure_var_in_reg(op.getarg(1), args) - lhsloc = self.xrm.force_result_in_reg(op, op.getarg(0), args) - self.perform(op, [lhsloc, rhsloc, imm(lhs.bytesize)], lhsloc) + resloc =
self.force_allocate_vector_reg_or_cc(op) + self.perform(op, [lhsloc, rhsloc, imm(lhs.bytesize)], resloc) + + def enforce_var_in_vector_reg(self, arg, forbidden_vars, selected_reg): + """ Enforce the allocation in a specific register. This can even be a forbidden + register. If it is forbidden, it will be moved to another register. + Use with caution, currently this is only used for the vectorization backend + instructions. + """ + xrm = self.xrm + if selected_reg not in xrm.free_regs: + variable = None + candidate_to_spill = None + for var, reg in self.xrm.reg_bindings.items(): + if reg is selected_reg: + variable = var + else: + if var not in forbidden_vars: + candidate_to_spill = var + # do we have a free register? + if len(xrm.free_regs) == 0: + # spill a non forbidden variable + self._spill_var(candidate_to_spill, forbidden_vars, None) + loc = xrm.free_regs.pop() + self.assembler.mov(selected_reg, loc) + reg = xrm.reg_bindings.get(arg, None) + if reg: + xrm.free_regs.append(reg) + self.assembler.mov(reg, selected_reg) + xrm.reg_bindings[arg] = selected_reg + xrm.reg_bindings[variable] = loc + + return selected_reg + return self.make_sure_var_in_reg(arg, forbidden_vars, selected_reg=selected_reg) + + def force_allocate_vector_reg_or_cc(self, var): + assert var.type == INT + if self.next_op_can_accept_cc(self.operations, self.rm.position): + # hack: return the ebp location to mean "lives in CC". This + # ebp will not actually be used, and the location will be freed + # after the next op as usual. + self.xrm.force_allocate_frame_reg(var) + return ebp + else: + # else, return a regular register (not ebp). + return self.xrm.force_allocate_reg(var) consider_vec_float_ne = consider_vec_float_eq consider_vec_int_eq = consider_vec_float_eq _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit