Author: Richard Plangger <planri...@gmail.com>
Branch: zarch-simd-support
Changeset: r87069:29bd4e207e97
Date: 2016-09-13 10:59 +0200
http://bitbucket.org/pypy/pypy/changeset/29bd4e207e97/
Log: copied over vector_ext file, some minor modifications

diff --git a/rpython/jit/backend/zarch/vector_ext.py b/rpython/jit/backend/zarch/vector_ext.py
--- a/rpython/jit/backend/zarch/vector_ext.py
+++ b/rpython/jit/backend/zarch/vector_ext.py
@@ -1,2 +1,842 @@
+import py
+from rpython.jit.metainterp.compile import ResumeGuardDescr
+from rpython.jit.metainterp.history import (ConstInt, INT, REF,
+    FLOAT, VECTOR, TargetToken)
+from rpython.jit.backend.llsupport.descr import (ArrayDescr, CallDescr,
+    unpack_arraydescr, unpack_fielddescr, unpack_interiorfielddescr)
+from rpython.jit.backend.llsupport.regalloc import get_scale
+from rpython.jit.metainterp.resoperation import (rop, ResOperation,
+    VectorOp, VectorGuardOp)
+from rpython.rlib.objectmodel import we_are_translated
+from rpython.rtyper.lltypesystem.lloperation import llop
+from rpython.rtyper.lltypesystem import lltype
+from rpython.jit.backend.ppc.locations import imm, RegisterLocation
+from rpython.jit.backend.ppc.arch import IS_BIG_ENDIAN
+from rpython.jit.backend.llsupport.vector_ext import VectorExt
+from rpython.jit.backend.ppc.arch import PARAM_SAVE_AREA_OFFSET
+import rpython.jit.backend.ppc.register as r
+import rpython.jit.backend.ppc.condition as c
+import rpython.jit.backend.ppc.locations as l
+from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
+from rpython.rtyper.lltypesystem import lltype, rffi
+from rpython.jit.codewriter import longlong
+from rpython.jit.backend.ppc.detect_feature import detect_vsx
+from rpython.rlib.objectmodel import always_inline
+def not_implemented(msg):
+    msg = '[zarch/vector_ext] %s\n' % msg
+    if we_are_translated():
+        llop.debug_print(lltype.Void, msg)
+    raise NotImplementedError(msg)
+def flush_vec_cc(asm, regalloc, condition, size, result_loc):
+    # After emitting an instruction that leaves a boolean result in
+    # a condition code (cc), call this.  In the common case, result_loc
+    # will be set to SPP by the regalloc, which in this case means
+    # "propagate it between this operation and the next guard by keeping
+    # it in the cc".  In the uncommon case, result_loc is another
+    # register, and we emit a load from the cc into this register.

+    # Possibly invert the bit in the CR
+    #bit, invert = c.encoding[condition]
+    #assert 24 <= bit <= 27
+    #if invert == 12:
+    #    pass
+    #elif invert == 4:
+    #    asm.mc.crnor(bit, bit, bit)
+    #else:
+    #    assert 0
+    #assert asm.guard_success_cc == c.cond_none
+    ##
+    #if result_loc is r.SPP:
+    #    asm.guard_success_cc = condition
+    #else:
+    #    resval = result_loc.value
+    #    # either doubleword integer 1 (2x) or word integer 1 (4x)
+    #    ones = regalloc.vrm.get_scratch_reg(type=INT).value
+    #    zeros = regalloc.vrm.get_scratch_reg(type=INT).value
+    #    asm.mc.vxor(zeros, zeros, zeros)
+    #    if size == 4:
+    #        asm.mc.vspltisw(ones, 1)
+    #    else:
+    #        assert size == 8
+    #        tloc = regalloc.rm.get_scratch_reg()
+    #        asm.mc.load_imm(tloc, asm.VEC_DOUBLE_WORD_ONES)
+    #        asm.mc.lvx(ones, 0, tloc.value)
+    #    asm.mc.vsel(resval, zeros, ones, resval)
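For reference, the commented-out PPC body above materializes the mask left by a
vector compare into lanes of integer 0/1 using vsel. A minimal Python model of
that select step (illustrative only; not part of the commit):

    # Each lane of the compare result is all-ones (true) or all-zeros
    # (false); vsel then picks integer 1 or 0 per lane.
    def vsel_materialize(mask_lanes):
        ones, zeros = 1, 0          # vspltisw ones / vxor zeros
        return [ones if lane else zeros for lane in mask_lanes]

    assert vsel_materialize([True, False]) == [1, 0]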
+
+class ZSIMDVectorExt(VectorExt):
+    def setup_once(self, asm):
+        if detect_simd_z():
+            self.enable(16, accum=True)
+            asm.setup_once_vector()
+        self._setup = True
+
+class VectorAssembler(object):
+    _mixin_ = True
+
+    # TODO VEC_DOUBLE_WORD_ONES = 0
+
+    def setup_once_vector(self):
+        # TODO if IS_BIG_ENDIAN:
+        # TODO     # 2x 64 bit signed integer(1) BE
+        # TODO     data = (b'\x00' * 7 + b'\x01') * 2
+        # TODO else:
+        # TODO     # 2x 64 bit signed integer(1) LE
+        # TODO     data = (b'\x01' + b'\x00' * 7) * 2
+        # TODO datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr, [])
+        # TODO mem = datablockwrapper.malloc_aligned(len(data), alignment=16)
+        # TODO datablockwrapper.done()
+        # TODO addr = rffi.cast(rffi.CArrayPtr(lltype.Char), mem)
+        # TODO for i in range(len(data)):
+        # TODO     addr[i] = data[i]
+        # TODO self.VEC_DOUBLE_WORD_ONES = mem
+
+    def emit_vec_load_f(self, op, arglocs, regalloc):
+        resloc, baseloc, indexloc, size_loc, ofs, integer_loc = arglocs
+        indexloc = self._apply_offset(indexloc, ofs)
+        itemsize = size_loc.value
+        if integer_loc.value:
+            self.mc.lxvd2x(resloc.value, indexloc.value, baseloc.value)
+        elif itemsize == 4:
+            self.mc.lxvw4x(resloc.value, indexloc.value, baseloc.value)
+        elif itemsize == 8:
+            self.mc.lxvd2x(resloc.value, indexloc.value, baseloc.value)
+        else:
+            not_implemented("vec_load_f itemsize %d" % itemsize)
+
+    emit_vec_load_i = emit_vec_load_f
+
+    def emit_vec_store(self, op, arglocs, regalloc):
+        baseloc, indexloc, valueloc, sizeloc, baseofs, \
+            integer_loc = arglocs
+        indexloc = self._apply_offset(indexloc, baseofs)
+        assert baseofs.value == 0
+        if integer_loc.value:
+            self.mc.stxvd2x(valueloc.value, indexloc.value, baseloc.value)
+        else:
+            itemsize = sizeloc.value
+            if itemsize == 4:
+                self.mc.stxvw4x(valueloc.value, indexloc.value, baseloc.value)
+            elif itemsize == 8:
+                self.mc.stxvd2x(valueloc.value, indexloc.value, baseloc.value)
+            else:
+                not_implemented("vec_store itemsize %d" % itemsize)
+
+    def emit_vec_int_add(self, op, arglocs, regalloc):
+        resloc, loc0, loc1, size_loc = arglocs
+        size = size_loc.value
+        if size == 1:
+            self.mc.vaddubm(resloc.value, loc0.value, loc1.value)
+        elif size == 2:
+            self.mc.vadduhm(resloc.value, loc0.value, loc1.value)
+        elif size == 4:
+            self.mc.vadduwm(resloc.value, loc0.value, loc1.value)
+        elif size == 8:
+            self.mc.vaddudm(resloc.value, loc0.value, loc1.value)
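The vaddu{b,h,w,d}m forms used above add lane-wise with modular (wrap-around)
arithmetic; the size operand picks the lane width in bytes. A sketch of the
semantics:

    def vec_add_modular(xs, ys, size):
        # lane width = size bytes; results wrap instead of saturating
        mask = (1 << (8 * size)) - 1
        return [(x + y) & mask for x, y in zip(xs, ys)]

    assert vec_add_modular([255], [1], 1) == [0]    # 8-bit lanes wrap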
+
+    def emit_vec_int_sub(self, op, arglocs, regalloc):
+        resloc, loc0, loc1, size_loc = arglocs
+        size = size_loc.value
+        if size == 1:
+            # TODO verify if unsigned subtract is the wanted feature
+            self.mc.vsububm(resloc.value, loc0.value, loc1.value)
+        elif size == 2:
+            # TODO verify if unsigned subtract is the wanted feature
+            self.mc.vsubuhm(resloc.value, loc0.value, loc1.value)
+        elif size == 4:
+            # TODO verify if unsigned subtract is the wanted feature
+            self.mc.vsubuwm(resloc.value, loc0.value, loc1.value)
+        elif size == 8:
+            self.mc.vsubudm(resloc.value, loc0.value, loc1.value)
+
+    def emit_vec_float_add(self, op, arglocs, regalloc):
+        resloc, loc0, loc1, itemsize_loc = arglocs
+        itemsize = itemsize_loc.value
+        if itemsize == 4:
+            self.mc.xvaddsp(resloc.value, loc0.value, loc1.value)
+        elif itemsize == 8:
+            self.mc.xvadddp(resloc.value, loc0.value, loc1.value)
+
+    def emit_vec_float_sub(self, op, arglocs, regalloc):
+        resloc, loc0, loc1, itemsize_loc = arglocs
+        itemsize = itemsize_loc.value
+        if itemsize == 4:
+            self.mc.xvsubsp(resloc.value, loc0.value, loc1.value)
+        elif itemsize == 8:
+            self.mc.xvsubdp(resloc.value, loc0.value, loc1.value)
+
+    def emit_vec_float_mul(self, op, arglocs, regalloc):
+        resloc, loc0, loc1, itemsize_loc = arglocs
+        itemsize = itemsize_loc.value
+        if itemsize == 4:
+            self.mc.xvmulsp(resloc.value, loc0.value, loc1.value)
+        elif itemsize == 8:
+            self.mc.xvmuldp(resloc.value, loc0.value, loc1.value)
+
+    def emit_vec_float_truediv(self, op, arglocs, regalloc):
+        resloc, loc0, loc1, itemsize_loc = arglocs
+        itemsize = itemsize_loc.value
+        if itemsize == 4:
+            self.mc.xvdivsp(resloc.value, loc0.value, loc1.value)
+        elif itemsize == 8:
+            self.mc.xvdivdp(resloc.value, loc0.value, loc1.value)
+
+    def emit_vec_int_mul(self, op, arglocs, regalloc):
+        raise NotImplementedError
+        pass # TODO
+
+    def emit_vec_int_and(self, op, arglocs, regalloc):
+        resloc, loc0, loc1, sizeloc = arglocs
+        self.mc.vand(resloc.value, loc0.value, loc1.value)
+
+    def emit_vec_int_or(self, op, arglocs, regalloc):
+        resloc, loc0, loc1, sizeloc = arglocs
+        self.mc.vor(resloc.value, loc0.value, loc1.value)
+
+    def emit_vec_int_xor(self, op, arglocs, regalloc):
+        resloc, loc0, loc1, sizeloc = arglocs
+        self.mc.vxor(resloc.value, loc0.value, loc1.value)
+
+    def emit_vec_int_signext(self, op, arglocs, regalloc):
+        resloc, loc0 = arglocs
+        # TODO
+        self.regalloc_mov(loc0, resloc)
+
+    def emit_vec_float_abs(self, op, arglocs, regalloc):
+        resloc, argloc, sizeloc = arglocs
+        size = sizeloc.value
+        if size == 4:
+            self.mc.xvabssp(resloc.value, argloc.value)
+        elif size == 8:
+            self.mc.xvabsdp(resloc.value, argloc.value)
+        else:
+            not_implemented("float abs for size %d" % size)
+
+    def emit_vec_float_neg(self, op, arglocs, regalloc):
+        resloc, argloc, sizeloc = arglocs
+        size = sizeloc.value
+        if size == 4:
+            self.mc.xvnegsp(resloc.value, argloc.value)
+        elif size == 8:
+            self.mc.xvnegdp(resloc.value, argloc.value)
+        else:
+            not_implemented("float neg for size %d" % size)
+
+    def emit_vec_guard_true(self, guard_op, arglocs, regalloc):
+        self._emit_guard(guard_op, arglocs)
+
+    def emit_vec_guard_false(self, guard_op, arglocs, regalloc):
+        self.guard_success_cc = c.negate(self.guard_success_cc)
+        self._emit_guard(guard_op, arglocs)
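All the float emitters above follow the same dispatch: itemsize 4 selects the
single-precision xv*sp forms (four lanes per 16-byte register), itemsize 8 the
double-precision xv*dp forms (two lanes). In short:

    def float_lane_count(itemsize):
        assert itemsize in (4, 8)   # only these reach the emitters
        return 16 // itemsize       # lanes in one 16-byte vector register

    assert float_lane_count(4) == 4 and float_lane_count(8) == 2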
+
+    def _update_at_exit(self, fail_locs, fail_args, faildescr, regalloc):
+        """ If accumulation is done in this loop, at the guard exit
+            some vector registers must be adjusted to yield the correct value
+        """
+        if not isinstance(faildescr, ResumeGuardDescr):
+            return
+        accum_info = faildescr.rd_vector_info
+        while accum_info:
+            pos = accum_info.getpos_in_failargs()
+            scalar_loc = fail_locs[pos]
+            vector_loc = accum_info.location
+            # the upper elements will be lost if saved to the stack!
+            scalar_arg = accum_info.getoriginal()
+            if not scalar_loc.is_reg():
+                scalar_loc = regalloc.force_allocate_reg(scalar_arg)
+            assert scalar_arg is not None
+            op = accum_info.accum_operation
+            self._accum_reduce(op, scalar_arg, vector_loc, scalar_loc)
+            accum_info = accum_info.next()
+
+    def _accum_reduce(self, op, arg, accumloc, targetloc):
+        # Currently the accumulator can ONLY be the biggest
+        # 64 bit float/int
+        tgt = targetloc.value
+        acc = accumloc.value
+        if arg.type == FLOAT:
+            # r = (r[0]+r[1],r[0]+r[1])
+            if IS_BIG_ENDIAN:
+                self.mc.xxpermdi(tgt, acc, acc, 0b00)
+            else:
+                self.mc.xxpermdi(tgt, acc, acc, 0b10)
+            if op == '+':
+                self.mc.xsadddp(tgt, tgt, acc)
+            elif op == '*':
+                self.mc.xsmuldp(tgt, tgt, acc)
+            else:
+                not_implemented("accumulate operator %s not implemented" % op)
+            return
+        else:
+            assert arg.type == INT
+            self.mc.load_imm(r.SCRATCH2, PARAM_SAVE_AREA_OFFSET)
+            self.mc.stvx(acc, r.SCRATCH2.value, r.SP.value)
+            self.mc.load(tgt, r.SP.value, PARAM_SAVE_AREA_OFFSET)
+            self.mc.load(r.SCRATCH.value, r.SP.value, PARAM_SAVE_AREA_OFFSET+8)
+            # combine the two 64 bit lanes loaded above
+            if op == '+':
+                self.mc.add(tgt, tgt, r.SCRATCH.value)
+            elif op == '*':
+                self.mc.mulld(tgt, tgt, r.SCRATCH.value)
+            else:
+                not_implemented("accumulate operator %s not implemented" % op)
+            return
+
+        not_implemented("reduce sum for %s not impl." % arg)
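_accum_reduce folds the two 64-bit lanes of the accumulator into one scalar:
xxpermdi (float) or a store/reload pair (int) brings both lanes into reach,
then a scalar add or multiply combines them. A Python model of the reduction
(illustrative only):

    def accum_reduce_model(lanes, op):
        # lanes = [acc[0], acc[1]]; result = acc[0] op acc[1]
        assert len(lanes) == 2
        if op == '+':
            return lanes[0] + lanes[1]
        elif op == '*':
            return lanes[0] * lanes[1]
        raise NotImplementedError(op)

    assert accum_reduce_model([2.0, 3.0], '+') == 5.0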
+
+    def emit_vec_int_is_true(self, op, arglocs, regalloc):
+        assert isinstance(op, VectorOp)
+        resloc, argloc, sizeloc = arglocs
+        size = sizeloc.value
+        tmp = regalloc.vrm.get_scratch_reg(type=INT).value
+        self.mc.vxor(tmp, tmp, tmp)
+        # argloc[i] > 0:
+        # For an unsigned integer that is equivalent to argloc[i] != 0
+        if size == 1:
+            self.mc.vcmpgtubx(resloc.value, argloc.value, tmp)
+        elif size == 2:
+            self.mc.vcmpgtuhx(resloc.value, argloc.value, tmp)
+        elif size == 4:
+            self.mc.vcmpgtuwx(resloc.value, argloc.value, tmp)
+        elif size == 8:
+            self.mc.vcmpgtudx(resloc.value, argloc.value, tmp)
+        flush_vec_cc(self, regalloc, c.VNEI, op.bytesize, resloc)
+
+    def emit_vec_float_eq(self, op, arglocs, regalloc):
+        assert isinstance(op, VectorOp)
+        resloc, loc1, loc2, sizeloc = arglocs
+        size = sizeloc.value
+        tmp = regalloc.vrm.get_scratch_reg().value
+        offloc = regalloc.rm.get_scratch_reg()
+        off = offloc.value
+        # SP is always 16 byte aligned, and PARAM_SAVE_AREA_OFFSET % 16 == 0
+        self.mc.load_imm(offloc, PARAM_SAVE_AREA_OFFSET)
+        if size == 4:
+            self.mc.xvcmpeqspx(tmp, loc1.value, loc2.value)
+            self.mc.stxvw4x(tmp, off, r.SP.value)
+        elif size == 8:
+            self.mc.xvcmpeqdpx(tmp, loc1.value, loc2.value)
+            self.mc.stxvd2x(tmp, off, r.SP.value)
+        else:
+            not_implemented("float == for size %d" % size)
+        self.mc.lvx(resloc.value, off, r.SP.value)
+        flush_vec_cc(self, regalloc, c.VEQI, op.bytesize, resloc)
+
+    def emit_vec_float_xor(self, op, arglocs, regalloc):
+        resloc, l0, l1, sizeloc = arglocs
+        res = resloc.value
+        r0 = l0.value
+        r1 = l1.value
+        self.mc.xxlxor(res, r0, r1)
+
+    def emit_vec_float_ne(self, op, arglocs, regalloc):
+        assert isinstance(op, VectorOp)
+        resloc, loc1, loc2, sizeloc = arglocs
+        size = sizeloc.value
+        tmp = regalloc.vrm.get_scratch_reg().value
+        offloc = regalloc.rm.get_scratch_reg()
+        off = offloc.value
+        # SP is always 16 byte aligned, and PARAM_SAVE_AREA_OFFSET % 16 == 0
+        self.mc.load_imm(offloc, PARAM_SAVE_AREA_OFFSET)
+        if size == 4:
+            self.mc.xvcmpeqspx(tmp, loc1.value, loc2.value)
+            self.mc.stxvw4x(tmp, off, r.SP.value)
+        elif size == 8:
+            self.mc.xvcmpeqdpx(tmp, loc1.value, loc2.value)
+            self.mc.stxvd2x(tmp, off, r.SP.value)
+        else:
+            not_implemented("float != for size %d" % size)
+        res = resloc.value
+        self.mc.lvx(res, off, r.SP.value)
+        self.mc.vnor(res, res, res) # complement
+        flush_vec_cc(self, regalloc, c.VNEI, op.bytesize, resloc)
+
+    def emit_vec_cast_int_to_float(self, op, arglocs, regalloc):
+        res, l0 = arglocs
+        offloc = regalloc.rm.get_scratch_reg()
+        off = offloc.value
+        # SP is always 16 byte aligned, and PARAM_SAVE_AREA_OFFSET % 16 == 0
+        self.mc.load_imm(offloc, PARAM_SAVE_AREA_OFFSET)
+        self.mc.stvx(l0.value, off, r.SP.value)
+        self.mc.lxvd2x(res.value, off, r.SP.value)
+        self.mc.xvcvsxddp(res.value, res.value)
+
+    def emit_vec_int_eq(self, op, arglocs, regalloc):
+        assert isinstance(op, VectorOp)
+        res, l0, l1, sizeloc = arglocs
+        size = sizeloc.value
+        if size == 1:
+            self.mc.vcmpequbx(res.value, l0.value, l1.value)
+        elif size == 2:
+            self.mc.vcmpequhx(res.value, l0.value, l1.value)
+        elif size == 4:
+            self.mc.vcmpequwx(res.value, l0.value, l1.value)
+        elif size == 8:
+            self.mc.vcmpequdx(res.value, l0.value, l1.value)
+        flush_vec_cc(self, regalloc, c.VEQI, op.bytesize, res)
+
+    def emit_vec_int_ne(self, op, arglocs, regalloc):
+        assert isinstance(op, VectorOp)
+        res, l0, l1, sizeloc = arglocs
+        size = sizeloc.value
+        # compare for equality, then complement the mask
+        if size == 1:
+            self.mc.vcmpequbx(res.value, l0.value, l1.value)
+        elif size == 2:
+            self.mc.vcmpequhx(res.value, l0.value, l1.value)
+        elif size == 4:
+            self.mc.vcmpequwx(res.value, l0.value, l1.value)
+        elif size == 8:
+            self.mc.vcmpequdx(res.value, l0.value, l1.value)
+        self.mc.vnor(res.value, res.value, res.value)
+        flush_vec_cc(self, regalloc, c.VEQI, op.bytesize, res)
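vec_int_ne and vec_float_ne are both built from the equality compare followed
by vnor of the result with itself, i.e. a bitwise NOT of the all-ones/all-zeros
mask. Modeled on lanes:

    def vec_ne_model(xs, ys):
        eq_mask = [x == y for x, y in zip(xs, ys)]      # vcmpequ*/xvcmpeq*
        return [not lane for lane in eq_mask]           # vnor res, res, res

    assert vec_ne_model([1, 2], [1, 3]) == [False, True]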
+
+    def emit_vec_expand_f(self, op, arglocs, regalloc):
+        assert isinstance(op, VectorOp)
+        resloc, srcloc = arglocs
+        size = op.bytesize
+        res = resloc.value
+        if isinstance(srcloc, l.ConstFloatLoc):
+            # they are aligned!
+            assert size == 8
+            tloc = regalloc.rm.get_scratch_reg()
+            self.mc.load_imm(tloc, srcloc.value)
+            self.mc.lxvd2x(res, 0, tloc.value)
+        elif size == 8:
+            # splat the low of src to both slots in res
+            src = srcloc.value
+            self.mc.xxspltdl(res, src, src)
+        else:
+            not_implemented("vec expand in this combination not supported")
+
+    def emit_vec_expand_i(self, op, arglocs, regalloc):
+        assert isinstance(op, VectorOp)
+        res, l0, off = arglocs
+        size = op.bytesize
+
+        self.mc.load_imm(r.SCRATCH2, off.value)
+        self.mc.lvx(res.value, r.SCRATCH2.value, r.SP.value)
+        if size == 1:
+            if IS_BIG_ENDIAN:
+                self.mc.vspltb(res.value, res.value, 0b0000)
+            else:
+                self.mc.vspltb(res.value, res.value, 0b1111)
+        elif size == 2:
+            if IS_BIG_ENDIAN:
+                self.mc.vsplth(res.value, res.value, 0b000)
+            else:
+                self.mc.vsplth(res.value, res.value, 0b111)
+        elif size == 4:
+            if IS_BIG_ENDIAN:
+                self.mc.vspltw(res.value, res.value, 0b00)
+            else:
+                self.mc.vspltw(res.value, res.value, 0b11)
+        elif size == 8:
+            pass
+        else:
+            not_implemented("expand int size not impl")
+
+    def emit_vec_pack_i(self, op, arglocs, regalloc):
+        assert isinstance(op, VectorOp)
+        resultloc, vloc, sourceloc, residxloc, srcidxloc, countloc = arglocs
+        srcidx = srcidxloc.value
+        residx = residxloc.value
+        count = countloc.value
+        res = resultloc.value
+        vector = vloc.value
+        src = sourceloc.value
+        size = op.bytesize
+        assert resultloc.is_vector_reg() # vector <- reg
+        self.mc.load_imm(r.SCRATCH2, PARAM_SAVE_AREA_OFFSET)
+        self.mc.stvx(vector, r.SCRATCH2.value, r.SP.value)
+        idx = residx
+        if size == 8:
+            if not IS_BIG_ENDIAN:
+                idx = (16 // size) - 1 - idx
+            self.mc.store(src, r.SP.value, PARAM_SAVE_AREA_OFFSET+8*idx)
+        elif size == 4:
+            for j in range(count):
+                idx = j + residx
+                if not IS_BIG_ENDIAN:
+                    idx = (16 // size) - 1 - idx
+                self.mc.stw(src, r.SP.value, PARAM_SAVE_AREA_OFFSET+4*idx)
+        elif size == 2:
+            for j in range(count):
+                idx = j + residx
+                if not IS_BIG_ENDIAN:
+                    idx = (16 // size) - 1 - idx
+                self.mc.sth(src, r.SP.value, PARAM_SAVE_AREA_OFFSET+2*idx)
+        elif size == 1:
+            for j in range(count):
+                idx = j + residx
+                if not IS_BIG_ENDIAN:
+                    idx = (16 // size) - 1 - idx
+                self.mc.stb(src, r.SP.value, PARAM_SAVE_AREA_OFFSET+idx)
+        self.mc.lvx(res, r.SCRATCH2.value, r.SP.value)
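The recurring (16 // size) - 1 - idx in the pack/unpack emitters converts an
element index between big- and little-endian lane numbering inside the 16-byte
register. A quick check:

    def le_lane(idx, size):
        # mirror element idx across the 16 // size lanes of the register
        return (16 // size) - 1 - idx

    assert le_lane(0, 8) == 1    # first of two doublewords
    assert le_lane(3, 4) == 0    # last of four words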
+
+    def emit_vec_unpack_i(self, op, arglocs, regalloc):
+        assert isinstance(op, VectorOp)
+        resloc, srcloc, idxloc, countloc, sizeloc = arglocs
+        idx = idxloc.value
+        res = resloc.value
+        src = srcloc.value
+        size = sizeloc.value
+        count = countloc.value
+        if count == 1:
+            assert srcloc.is_vector_reg()
+            assert not resloc.is_vector_reg()
+            off = PARAM_SAVE_AREA_OFFSET
+            self.mc.load_imm(r.SCRATCH2, off)
+            self.mc.stvx(src, r.SCRATCH2.value, r.SP.value)
+            if not IS_BIG_ENDIAN:
+                idx = (16 // size) - 1 - idx
+            off += size * idx
+            if size == 8:
+                self.mc.load(res, r.SP.value, off)
+                return
+            elif size == 4:
+                self.mc.lwa(res, r.SP.value, off)
+                return
+            elif size == 2:
+                self.mc.lha(res, r.SP.value, off)
+                return
+            elif size == 1:
+                self.mc.lbz(res, r.SP.value, off)
+                self.mc.extsb(res, res)
+                return
+        else:
+            # count is not 1, but only 2 is supported for i32
+            # 4 for i16 and 8 for i8.
+            src = srcloc.value
+            res = resloc.value
+
+            self.mc.load_imm(r.SCRATCH2, PARAM_SAVE_AREA_OFFSET)
+            self.mc.stvx(src, r.SCRATCH2.value, r.SP.value)
+            self.mc.load_imm(r.SCRATCH2, PARAM_SAVE_AREA_OFFSET+16)
+            self.mc.stvx(res, r.SCRATCH2.value, r.SP.value)
+            if count * size == 8:
+                endian_off = 0
+                if not IS_BIG_ENDIAN:
+                    endian_off = 8
+                off = PARAM_SAVE_AREA_OFFSET
+                off = off + endian_off - (idx * size)
+                assert idx * size + 8 <= 16
+                self.mc.load(r.SCRATCH.value, r.SP.value, off)
+                self.mc.store(r.SCRATCH.value, r.SP.value, PARAM_SAVE_AREA_OFFSET+16+endian_off)
+                self.mc.lvx(res, r.SCRATCH2.value, r.SP.value)
+                return
+
+        not_implemented("%d bit integer, count %d" % \
+                (size*8, count))
+
+    def emit_vec_pack_f(self, op, arglocs, regalloc):
+        assert isinstance(op, VectorOp)
+        resloc, vloc, srcloc, residxloc, srcidxloc, countloc = arglocs
+        vec = vloc.value
+        res = resloc.value
+        src = srcloc.value
+        count = countloc.value
+        residx = residxloc.value
+        srcidx = srcidxloc.value
+        size = op.bytesize
+        # srcloc is always a floating point register f, this means it is
+        # vsr[0] == valueof(f)
+        if srcidx == 0:
+            if residx == 0:
+                # r = (s[0], v[1])
+                self.mc.xxpermdi(res, src, vec, permi(0,1))
+            else:
+                assert residx == 1
+                # r = (v[0], s[0])
+                self.mc.xxpermdi(res, vec, src, permi(1,1))
+        else:
+            assert srcidx == 1
+            if residx == 0:
+                # r = (s[1], v[1])
+                self.mc.xxpermdi(res, src, vec, permi(1,1))
+            else:
+                assert residx == 1
+                # r = (v[0], s[1])
+                self.mc.xxpermdi(res, vec, src, permi(0,1))
+
+    def emit_vec_unpack_f(self, op, arglocs, regalloc):
+        assert isinstance(op, VectorOp)
+        resloc, srcloc, srcidxloc, countloc = arglocs
+        res = resloc.value
+        src = srcloc.value
+        srcidx = srcidxloc.value
+        size = op.bytesize
+        # srcloc is always a floating point register f, this means it is
+        # vsr[0] == valueof(f)
+        if srcidx == 0:
+            # r = (s[0], s[1])
+            self.mc.xxpermdi(res, src, src, permi(0,1))
+            return
+        else:
+            # r = (s[1], s[0])
+            self.mc.xxpermdi(res, src, src, permi(1,0))
+            return
+        not_implemented("unpack for combination src %d -> res %d" % (srcidx, residx))
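xxpermdi builds its result from one doubleword of each source, selected by a
2-bit immediate; permi (defined in the PPC backend this file was copied from)
computes that immediate. A hypothetical Python model of the selection, using
big-endian lane numbering (an assumption, not part of the commit):

    def xxpermdi_model(a, b, dm):
        # bit 1 of dm selects the doubleword taken from a,
        # bit 0 the doubleword taken from b
        return (a[(dm >> 1) & 1], b[dm & 1])

    # r = (s[0], v[1]) as in the srcidx == 0, residx == 0 branch:
    assert xxpermdi_model(('s0', 's1'), ('v0', 'v1'), 0b01) == ('s0', 'v1')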
+
+    def emit_vec_cast_float_to_int(self, op, arglocs, regalloc):
+        res, l0 = arglocs
+        offloc = regalloc.rm.get_scratch_reg()
+        v0 = regalloc.vrm.get_scratch_reg(type=INT)
+        off = offloc.value
+        # SP is always 16 byte aligned, and PARAM_SAVE_AREA_OFFSET % 16 == 0
+        self.mc.load_imm(offloc, PARAM_SAVE_AREA_OFFSET)
+        self.mc.xvcvdpsxds(res.value, l0.value)
+
+    # needed as soon as PPC's support_singlefloat is implemented!
+    #def genop_vec_cast_singlefloat_to_float(self, op, arglocs, regalloc):
+    #    self.mc.CVTPS2PD(resloc, arglocs[0])
+
+    def emit_vec_f(self, op, arglocs, regalloc):
+        pass
+    emit_vec_i = emit_vec_f
+
+class VectorRegalloc(object):
+    _mixin_ = True
+
+    def force_allocate_vector_reg(self, op):
+        forbidden_vars = self.vrm.temp_boxes
+        return self.vrm.force_allocate_reg(op, forbidden_vars)
+
+    def force_allocate_vector_reg_or_cc(self, op):
+        assert op.type == INT
+        if self.next_op_can_accept_cc(self.operations, self.rm.position):
+            # hack: return the SPP location to mean "lives in CC".  This
+            # SPP will not actually be used, and the location will be freed
+            # after the next op as usual.
+            self.rm.force_allocate_frame_reg(op)
+            return r.SPP
+        else:
+            return self.force_allocate_vector_reg(op)
+
+    def ensure_vector_reg(self, box):
+        return self.vrm.make_sure_var_in_reg(box,
+                forbidden_vars=self.vrm.temp_boxes)
+
+    def _prepare_load(self, op):
+        descr = op.getdescr()
+        assert isinstance(descr, ArrayDescr)
+        assert not descr.is_array_of_pointers() and \
+               not descr.is_array_of_structs()
+        itemsize, ofs, _ = unpack_arraydescr(descr)
+        integer = not (descr.is_array_of_floats() or descr.getconcrete_type() == FLOAT)
+        args = op.getarglist()
+        a0 = op.getarg(0)
+        a1 = op.getarg(1)
+        base_loc = self.ensure_reg(a0)
+        ofs_loc = self.ensure_reg(a1)
+        result_loc = self.force_allocate_vector_reg(op)
+        return [result_loc, base_loc, ofs_loc, imm(itemsize), imm(ofs),
+                imm(integer)]
+
+    prepare_vec_load_i = _prepare_load
+    prepare_vec_load_f = _prepare_load
+
+    def prepare_vec_arith(self, op):
+        assert isinstance(op, VectorOp)
+        a0 = op.getarg(0)
+        a1 = op.getarg(1)
+        size = op.bytesize
+        args = op.getarglist()
+        loc0 = self.ensure_vector_reg(a0)
+        loc1 = self.ensure_vector_reg(a1)
+        resloc = self.force_allocate_vector_reg(op)
+        return [resloc, loc0, loc1, imm(size)]
+
+    prepare_vec_int_add = prepare_vec_arith
+    prepare_vec_int_sub = prepare_vec_arith
+    prepare_vec_int_mul = prepare_vec_arith
+    prepare_vec_float_add = prepare_vec_arith
+    prepare_vec_float_sub = prepare_vec_arith
+    prepare_vec_float_mul = prepare_vec_arith
+    prepare_vec_float_truediv = prepare_vec_arith
+
+    # logic functions
+    prepare_vec_int_and = prepare_vec_arith
+    prepare_vec_int_or = prepare_vec_arith
+    prepare_vec_int_xor = prepare_vec_arith
+    prepare_vec_float_xor = prepare_vec_arith
+    del prepare_vec_arith
+
+    def prepare_vec_bool(self, op):
+        assert isinstance(op, VectorOp)
+        a0 = op.getarg(0)
+        a1 = op.getarg(1)
+        size = op.bytesize
+        args = op.getarglist()
+        loc0 = self.ensure_vector_reg(a0)
+        loc1 = self.ensure_vector_reg(a1)
+        resloc = self.force_allocate_vector_reg_or_cc(op)
+        return [resloc, loc0, loc1, imm(size)]
+
+    prepare_vec_float_eq = prepare_vec_bool
+    prepare_vec_float_ne = prepare_vec_bool
+    prepare_vec_int_eq = prepare_vec_bool
+    prepare_vec_int_ne = prepare_vec_bool
+    del prepare_vec_bool
+
+    def prepare_vec_store(self, op):
+        descr = op.getdescr()
+        assert isinstance(descr, ArrayDescr)
+        assert not descr.is_array_of_pointers() and \
+               not descr.is_array_of_structs()
+        itemsize, ofs, _ = unpack_arraydescr(descr)
+        a0 = op.getarg(0)
+        a1 = op.getarg(1)
+        a2 = op.getarg(2)
+        baseloc = self.ensure_reg(a0)
+        ofsloc = self.ensure_reg(a1)
+        valueloc = self.ensure_vector_reg(a2)
+
+        integer = not (descr.is_array_of_floats() or descr.getconcrete_type() == FLOAT)
+        return [baseloc, ofsloc, valueloc,
+                imm(itemsize), imm(ofs), imm(integer)]
+
+    def prepare_vec_int_signext(self, op):
+        assert isinstance(op, VectorOp)
+        a0 = op.getarg(0)
+        loc0 = self.ensure_vector_reg(a0)
+        resloc = self.force_allocate_vector_reg(op)
+        return [resloc, loc0]
+
+    def prepare_vec_arith_unary(self, op):
+        assert isinstance(op, VectorOp)
+        a0 = op.getarg(0)
+        loc0 = self.ensure_vector_reg(a0)
+        resloc = self.force_allocate_vector_reg(op)
+        sizeloc = imm(op.bytesize)
+        return [resloc, loc0, sizeloc]
+
+    prepare_vec_float_neg = prepare_vec_arith_unary
+    prepare_vec_float_abs = prepare_vec_arith_unary
+    del prepare_vec_arith_unary
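Throughout the file, each prepare_X method returns the arglocs list that the
matching emit_X later unpacks positionally; keeping the two orders in sync is
the implicit contract between the regalloc and assembler mixins. A toy
illustration (names hypothetical, not part of the commit):

    def prepare_toy(res, a0, a1, size):
        return [res, a0, a1, size]               # built by the regalloc mixin

    def emit_toy(arglocs):
        resloc, loc0, loc1, size_loc = arglocs   # unpacked by the assembler
        return resloc, size_loc

    assert emit_toy(prepare_toy('v0', 'v1', 'v2', 8)) == ('v0', 8)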
+
+    def prepare_vec_pack_i(self, op):
+        # new_res = vec_pack_i(res, src, index, count)
+        assert isinstance(op, VectorOp)
+        arg = op.getarg(1)
+        index = op.getarg(2)
+        count = op.getarg(3)
+        assert isinstance(index, ConstInt)
+        assert isinstance(count, ConstInt)
+        vloc = self.ensure_vector_reg(op.getarg(0))
+        srcloc = self.ensure_reg(arg)
+        resloc = self.force_allocate_vector_reg(op)
+        residx = index.value # where to put it in result?
+        srcidx = 0
+        return [resloc, vloc, srcloc, imm(residx), imm(srcidx), imm(count.value)]
+
+    def prepare_vec_pack_f(self, op):
+        # new_res = vec_pack_f(res, src, index, count)
+        assert isinstance(op, VectorOp)
+        arg = op.getarg(1)
+        index = op.getarg(2)
+        count = op.getarg(3)
+        assert isinstance(index, ConstInt)
+        assert isinstance(count, ConstInt)
+        assert not arg.is_vector()
+        srcloc = self.ensure_reg(arg)
+        vloc = self.ensure_vector_reg(op.getarg(0))
+        if op.is_vector():
+            resloc = self.force_allocate_vector_reg(op)
+        else:
+            resloc = self.force_allocate_reg(op)
+        residx = index.value # where to put it in result?
+        srcidx = 0
+        return [resloc, vloc, srcloc, imm(residx), imm(srcidx), imm(count.value)]
+
+    def prepare_vec_unpack_f(self, op):
+        index = op.getarg(1)
+        count = op.getarg(2)
+        assert isinstance(index, ConstInt)
+        assert isinstance(count, ConstInt)
+        srcloc = self.ensure_vector_reg(op.getarg(0))
+        resloc = self.force_allocate_reg(op)
+        return [resloc, srcloc, imm(index.value), imm(count.value)]
+
+    def prepare_vec_unpack_i(self, op):
+        assert isinstance(op, VectorOp)
+        index = op.getarg(1)
+        count = op.getarg(2)
+        assert isinstance(index, ConstInt)
+        assert isinstance(count, ConstInt)
+        arg = op.getarg(0)
+        if arg.is_vector():
+            srcloc = self.ensure_vector_reg(arg)
+        else:
+            # unpack
+            srcloc = self.ensure_reg(arg)
+        size = arg.bytesize
+        if op.is_vector():
+            resloc = self.force_allocate_vector_reg(op)
+        else:
+            resloc = self.force_allocate_reg(op)
+        return [resloc, srcloc, imm(index.value), imm(count.value), imm(size)]
+
+    def expand_float(self, size, box):
+        adr = self.assembler.datablockwrapper.malloc_aligned(16, 16)
+        fs = box.getfloatstorage()
+        rffi.cast(rffi.CArrayPtr(longlong.FLOATSTORAGE), adr)[0] = fs
+        rffi.cast(rffi.CArrayPtr(longlong.FLOATSTORAGE), adr)[1] = fs
+        return l.ConstFloatLoc(adr)
+
+    def prepare_vec_expand_f(self, op):
+        assert isinstance(op, VectorOp)
+        arg = op.getarg(0)
+        if arg.is_constant():
+            l0 = self.expand_float(op.bytesize, arg)
+            res = self.force_allocate_vector_reg(op)
+        else:
+            l0 = self.ensure_reg(arg)
+            res = self.force_allocate_vector_reg(op)
+        return [res, l0]
+
+    def prepare_vec_expand_i(self, op):
+        assert isinstance(op, VectorOp)
+        arg = op.getarg(0)
+        mc = self.assembler.mc
+        if arg.is_constant():
+            assert isinstance(arg, ConstInt)
+            l0 = self.rm.get_scratch_reg()
+            mc.load_imm(l0, arg.value)
+        else:
+            l0 = self.ensure_reg(arg)
+        mc.store(l0.value, r.SP.value, PARAM_SAVE_AREA_OFFSET)
+        size = op.bytesize
+        if size == 8:
+            mc.store(l0.value, r.SP.value, PARAM_SAVE_AREA_OFFSET+8)
+        res = self.force_allocate_vector_reg(op)
+        return [res, l0, imm(PARAM_SAVE_AREA_OFFSET)]
+
+    def prepare_vec_int_is_true(self, op):
+        assert isinstance(op, VectorOp)
+        arg = op.getarg(0)
+        assert isinstance(arg, VectorOp)
+        argloc = self.ensure_vector_reg(arg)
+        resloc = self.force_allocate_vector_reg_or_cc(op)
+        return [resloc, argloc, imm(arg.bytesize)]
+
+    def _prepare_vec(self, op):
+        # pseudo instruction, needed to allocate a register for a new variable
+        return [self.force_allocate_vector_reg(op)]
+
+    prepare_vec_i = _prepare_vec
+    prepare_vec_f = _prepare_vec
+
+    def prepare_vec_cast_float_to_int(self, op):
+        l0 = self.ensure_vector_reg(op.getarg(0))
+        res = self.force_allocate_vector_reg(op)
+        return [res, l0]
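expand_float above materializes a float constant by writing the same 64-bit
value into both halves of a 16-byte aligned block; emit_vec_expand_f then loads
it with lxvd2x. The resulting layout, modeled with the stdlib (illustrative
only):

    import struct

    def expand_float_layout(value):
        # both 8-byte slots of the 16-byte block hold the same double
        return struct.pack('=dd', value, value)

    assert len(expand_float_layout(1.5)) == 16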
+
+    prepare_vec_cast_int_to_float = prepare_vec_cast_float_to_int
+
+    def prepare_vec_guard_true(self, op):
+        self.assembler.guard_success_cc = c.VEQ
+        return self._prepare_guard(op)
+
+    def prepare_vec_guard_false(self, op):
+        self.assembler.guard_success_cc = c.VNE
+        return self._prepare_guard(op)

_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit