Author: Richard Plangger <planri...@gmail.com> Branch: zarch-simd-support Changeset: r87107:4c3e946b5c05 Date: 2016-09-14 16:19 +0200 http://bitbucket.org/pypy/pypy/changeset/4c3e946b5c05/
Log: more vector ops, int->float, comparison float diff --git a/rpython/jit/backend/zarch/assembler.py b/rpython/jit/backend/zarch/assembler.py --- a/rpython/jit/backend/zarch/assembler.py +++ b/rpython/jit/backend/zarch/assembler.py @@ -919,6 +919,10 @@ return frame_depth def regalloc_mov(self, prev_loc, loc): + if prev_loc.is_vector_reg(): + assert loc.is_vector_reg() + self.mc.VLR(loc, prev_loc) + return if prev_loc.is_imm(): value = prev_loc.getint() # move immediate value to register diff --git a/rpython/jit/backend/zarch/conditions.py b/rpython/jit/backend/zarch/conditions.py --- a/rpython/jit/backend/zarch/conditions.py +++ b/rpython/jit/backend/zarch/conditions.py @@ -18,7 +18,7 @@ # normal branch instructions FLOAT = ConditionLocation(0x10) -EQ = ConditionLocation(0x8) +VEQI = EQ = ConditionLocation(0x8) LT = ConditionLocation(0x4) GT = ConditionLocation(0x2) OF = ConditionLocation(0x1) # overflow @@ -27,7 +27,7 @@ FLE = ConditionLocation(EQ.value | LT.value) GE = ConditionLocation(EQ.value | GT.value | OF.value) FGE = ConditionLocation(EQ.value | GT.value) -NE = ConditionLocation(LT.value | GT.value | OF.value) +VNEI = NE = ConditionLocation(LT.value | GT.value | OF.value) NO = ConditionLocation(0xe) # NO overflow FGT = ConditionLocation(GT.value | OF.value) diff --git a/rpython/jit/backend/zarch/instruction_builder.py b/rpython/jit/backend/zarch/instruction_builder.py --- a/rpython/jit/backend/zarch/instruction_builder.py +++ b/rpython/jit/backend/zarch/instruction_builder.py @@ -468,27 +468,27 @@ byte = (v1 & BIT_MASK_4) << 4 | (idx & BIT_MASK_4) self.writechar(chr(byte)) encode_base_displace(self, bid) - self.writechar(chr((mask & BIT_MASK_4 << 4) | (rbx & BIT_MASK_4))) + self.writechar(chr((mask & BIT_MASK_4) << 4 | (rbx & BIT_MASK_4))) self.writechar(opcode2) return encode_vrx -def build_vrr_a(mnemonic, (opcode1,opcode2), argtypes='v,v'): +def build_vrr_a(mnemonic, (opcode1,opcode2), argtypes='v,v,m,m,m'): @builder.arguments(argtypes) - def encode_vrr_a(self, v1, v2): + def encode_vrr_a(self, v1, v2, mask3=0, mask4=0, mask5=0): self.writechar(opcode1) rbx = (v1 >= 16) << 3 rbx |= (v2 >= 16) << 2 byte = (v1 & BIT_MASK_4) << 4 | (v2 & BIT_MASK_4) self.writechar(chr(byte)) self.writechar(chr(0)) - self.writechar(chr(0)) - self.writechar(chr(rbx & BIT_MASK_4)) + self.writechar(chr((mask5 & BIT_MASK_4) << 4 | (mask4 & BIT_MASK_4))) + self.writechar(chr((mask3 & BIT_MASK_4) << 4 | (rbx & BIT_MASK_4))) self.writechar(opcode2) return encode_vrr_a -def build_vrr_c(mnemonic, (opcode1,opcode2), argtypes='v,v,v,m,m'): +def build_vrr_c(mnemonic, (opcode1,opcode2), argtypes='v,v,v,m,m,m'): @builder.arguments(argtypes) - def encode_vrr_c(self, v1, v2, v3, mask1=0, mask2=0): + def encode_vrr_c(self, v1, v2, v3, mask4=0, mask5=0, mask6=0): self.writechar(opcode1) rbx = (v1 >= 16) << 3 rbx |= (v2 >= 16) << 2 @@ -497,11 +497,40 @@ self.writechar(chr(byte)) byte = (v3 & BIT_MASK_4) << 4 self.writechar(chr(byte)) - self.writechar(chr(mask2 & BIT_MASK_4)) - self.writechar(chr((mask1 & BIT_MASK_4) << 4 | (rbx & BIT_MASK_4))) + self.writechar(chr((mask6 & BIT_MASK_4) << 4 | (mask5 & BIT_MASK_4))) + self.writechar(chr((mask4 & BIT_MASK_4) << 4 | (rbx & BIT_MASK_4))) self.writechar(opcode2) return encode_vrr_c +def build_vrr_e(mnemonic, (opcode1,opcode2), argtypes='v,v,v,v,m,m'): + @builder.arguments(argtypes) + def encode_vrr_e(self, v1, v2, v3, v4, mask5=0, mask6=0): + self.writechar(opcode1) + rbx = (v1 >= 16) << 3 + rbx |= (v2 >= 16) << 2 + rbx |= (v3 >= 16) << 1 + rbx |= (v4 >= 16) + byte = (v1 & BIT_MASK_4) << 4 | (v2 & BIT_MASK_4) + self.writechar(chr(byte)) + byte = (v3 & BIT_MASK_4) << 4 | (mask6 & BIT_MASK_4) << 4 + self.writechar(chr(byte)) + self.writechar(chr((mask5 & BIT_MASK_4))) + self.writechar(chr((v4 & BIT_MASK_4) << 4 | (rbx & BIT_MASK_4))) + self.writechar(opcode2) + return encode_vrr_e + +def build_vri_a(mnemonic, (opcode1,opcode2), argtypes='v,i16,m'): + @builder.arguments(argtypes) + def encode_vri_a(self, v1, i2, mask3): + self.writechar(opcode1) + rbx = (v1 >= 16) << 3 + byte = (v1 & BIT_MASK_4) << 4 + self.writechar(chr(byte)) + self.write_i16(i2 & BIT_MASK_16) + self.writechar(chr((mask3 & BIT_MASK_4) << 4 | (rbx & BIT_MASK_4))) + self.writechar(opcode2) + return encode_vri_a + def build_unpack_func(mnemonic, func): @always_inline @@ -555,13 +584,12 @@ return arg unpack_arg._annspecialcase_ = 'specialize:arg(1)' argtypes = func._arguments_[:] - #while len(argtypes) > 0 and argtypes[-1] == '-': - # argtypes.pop() at = argtypes[0] if len(argtypes) >= 1 else '-' bt = argtypes[1] if len(argtypes) >= 2 else '-' ct = argtypes[2] if len(argtypes) >= 3 else '-' dt = argtypes[3] if len(argtypes) >= 4 else '-' et = argtypes[4] if len(argtypes) >= 5 else '-' + ft = argtypes[5] if len(argtypes) >= 6 else '-' def function0(self): return func(self) def function1(self, a): @@ -601,6 +629,14 @@ i = unpack_arg(d, dt) j = unpack_arg(e, et) return func(self, f, g, h, i, j) + def function6(self, a, b, c, d, e, f): + g = unpack_arg(a, at) + h = unpack_arg(b, bt) + i = unpack_arg(c, ct) + j = unpack_arg(d, dt) + k = unpack_arg(e, et) + l = unpack_arg(f, ft) + return func(self, g, h, i, j, k, l) if len(argtypes) == 0: function = function0 elif len(argtypes) == 1: @@ -622,6 +658,8 @@ function = function4_last_default elif len(argtypes) == 5: function = function5 + elif len(argtypes) == 6: + function = function6 else: assert 0, "implement function for argtypes %s" % (argtypes,) function.__name__ = mnemonic diff --git a/rpython/jit/backend/zarch/instructions.py b/rpython/jit/backend/zarch/instructions.py --- a/rpython/jit/backend/zarch/instructions.py +++ b/rpython/jit/backend/zarch/instructions.py @@ -295,7 +295,7 @@ vector_mnemonic_codes = { 'VL': ('vrx', ['\xE7','\x06'], 'v,bid'), - 'VLR': ('vrr_a', ['\xE7','\x56']), + 'VLR': ('vrr_a', ['\xE7','\x56'], 'v,v'), 'VST': ('vrx', ['\xE7','\x0E'], 'v,bid'), @@ -316,6 +316,16 @@ 'VFM': ('vrr_c', ['\xE7','\xE7']), 'VFD': ('vrr_c', ['\xE7','\xE5']), + # conversion + 'VCDG': ('vrr_a', ['\xE7','\xC3']), + + # compare, sign, ... + 'VFPSO': ('vrr_a', ['\xE7','\xCC']), + 'VFCE': ('vrr_c', ['\xE7','\xE8']), + 'VSEL': ('vrr_e', ['\xE7','\x8D'], 'v,v,v,v'), + 'VPERM': ('vrr_e', ['\xE7','\x8C'], 'v,v,v,v'), + 'VREPI': ('vri_a', ['\xE7','\x45']), + # '': ('', ['','']), } diff --git a/rpython/jit/backend/zarch/locations.py b/rpython/jit/backend/zarch/locations.py --- a/rpython/jit/backend/zarch/locations.py +++ b/rpython/jit/backend/zarch/locations.py @@ -25,6 +25,9 @@ def is_fp_reg(self): return False + def is_vector_reg(self): + return False + def is_imm_float(self): return False @@ -118,13 +121,10 @@ def is_core_reg(self): return False - def is_fp_reg(self): - return True - def as_key(self): # 16 <= as_key <= 32 return self.value + 32 - def is_float(self): + def is_vector_reg(self): return True class ImmLocation(AssemblerLocation): diff --git a/rpython/jit/backend/zarch/regalloc.py b/rpython/jit/backend/zarch/regalloc.py --- a/rpython/jit/backend/zarch/regalloc.py +++ b/rpython/jit/backend/zarch/regalloc.py @@ -54,6 +54,12 @@ def __repr__(self): return "<TempFloat at %s>" % (id(self),) +class TempVector(TempVar): + type = 'V' + + def __repr__(self): + return "<TempVector at %s>" % (id(self),) + class FPRegisterManager(RegisterManager): all_regs = r.MANAGED_FP_REGS @@ -142,8 +148,7 @@ return loc def get_scratch_reg(self, selected_reg=None): - # TODO - box = TempFloat() + box = TempVector() reg = self.force_allocate_reg(box, forbidden_vars=self.temp_boxes, selected_reg=selected_reg) self.temp_boxes.append(box) return reg diff --git a/rpython/jit/backend/zarch/vector_ext.py b/rpython/jit/backend/zarch/vector_ext.py --- a/rpython/jit/backend/zarch/vector_ext.py +++ b/rpython/jit/backend/zarch/vector_ext.py @@ -15,6 +15,7 @@ import rpython.jit.backend.zarch.registers as r import rpython.jit.backend.zarch.conditions as c import rpython.jit.backend.zarch.locations as l +import rpython.jit.backend.zarch.masks as m from rpython.jit.backend.zarch.locations import imm from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper from rpython.rtyper.lltypesystem import lltype, rffi @@ -27,42 +28,22 @@ llop.debug_print(lltype.Void, msg) raise NotImplementedError(msg) -def flush_vec_cc(asm, regalloc, condition, size, result_loc): +def flush_vec_cc(asm, regalloc, condition, size, resultloc): # After emitting an instruction that leaves a boolean result in - # a condition code (cc), call this. In the common case, result_loc + # a condition code (cc), call this. In the common case, resultloc # will be set to SPP by the regalloc, which in this case means # "propagate it between this operation and the next guard by keeping - # it in the cc". In the uncommon case, result_loc is another + # it in the cc". In the uncommon case, resultloc is another # register, and we emit a load from the cc into this register. - # Possibly invert the bit in the CR - #bit, invert = c.encoding[condition] - #assert 24 <= bit <= 27 - #if invert == 12: - # pass - #elif invert == 4: - # asm.mc.crnor(bit, bit, bit) - #else: - # assert 0 - #assert asm.guard_success_cc == c.cond_none - ## - #if result_loc is r.SPP: - # asm.guard_success_cc = condition - #else: - # resval = result_loc.value - # # either doubleword integer 1 (2x) or word integer 1 (4x) - # ones = regalloc.vrm.get_scratch_reg(type=INT).value - # zeros = regalloc.vrm.get_scratch_reg(type=INT).value - # asm.mc.vxor(zeros, zeros, zeros) - # if size == 4: - # asm.mc.vspltisw(ones, 1) - # else: - # assert size == 8 - # tloc = regalloc.rm.get_scratch_reg() - # asm.mc.load_imm(tloc, asm.VEC_DOUBLE_WORD_ONES) - # asm.mc.lvx(ones, 0, tloc.value) - # asm.mc.vsel(resval, zeros, ones, resval) - pass + if resultloc is r.SPP: + asm.guard_success_cc = condition + else: + ones = regalloc.vrm.get_scratch_reg() + zeros = regalloc.vrm.get_scratch_reg() + asm.mc.VX(zeros, zeros, zeros) + asm.mc.VREPI(ones, l.imm(1), l.itemsize_to_mask(size)) + asm.mc.VSEL(resultloc, ones, zeros, resultloc) class ZSIMDVectorExt(VectorExt): def setup_once(self, asm): @@ -119,7 +100,7 @@ resloc, loc0, loc1, itemsize_loc = arglocs itemsize = itemsize_loc.value if itemsize == 8: - self.mc.VFA(resloc, loc0, loc1, 3, 0) + self.mc.VFA(resloc, loc0, loc1, 3, 0, 0) return not_implemented("vec_float_add of size %d" % itemsize) @@ -127,7 +108,7 @@ resloc, loc0, loc1, itemsize_loc = arglocs itemsize = itemsize_loc.value if itemsize == 8: - self.mc.VFS(resloc, loc0, loc1, 3, 0) + self.mc.VFS(resloc, loc0, loc1, 3, 0, 0) return not_implemented("vec_float_sub of size %d" % itemsize) @@ -135,7 +116,7 @@ resloc, loc0, loc1, itemsize_loc = arglocs itemsize = itemsize_loc.value if itemsize == 8: - self.mc.VFM(resloc, loc0, loc1, 3, 0) + self.mc.VFM(resloc, loc0, loc1, 3, 0, 0) return not_implemented("vec_float_mul of size %d" % itemsize) @@ -143,7 +124,7 @@ resloc, loc0, loc1, itemsize_loc = arglocs itemsize = itemsize_loc.value if itemsize == 8: - self.mc.VFD(resloc, loc0, loc1, 3, 0) + self.mc.VFD(resloc, loc0, loc1, 3, 0, 0) return not_implemented("vec_float_truediv of size %d" % itemsize) @@ -168,22 +149,18 @@ def emit_vec_float_abs(self, op, arglocs, regalloc): resloc, argloc, sizeloc = arglocs size = sizeloc.value - if size == 4: - self.mc.xvabssp(resloc.value, argloc.value) - elif size == 8: - self.mc.xvabsdp(resloc.value, argloc.value) - else: - not_implemented("float abs for size %d" % size) + if size == 8: + self.mc.VFPSO(resloc, argloc, 3, 0, 2) + return + not_implemented("vec_float_abs of size %d" % itemsize) def emit_vec_float_neg(self, op, arglocs, regalloc): resloc, argloc, sizeloc = arglocs size = sizeloc.value - if size == 4: - self.mc.xvnegsp(resloc.value, argloc.value) - elif size == 8: - self.mc.xvnegdp(resloc.value, argloc.value) - else: - not_implemented("float neg for size %d" % size) + if size == 8: + self.mc.VFPSO(resloc, argloc, 3, 0, 0) + return + not_implemented("vec_float_abs of size %d" % itemsize) def emit_vec_guard_true(self, guard_op, arglocs, regalloc): self._emit_guard(guard_op, arglocs) @@ -212,42 +189,9 @@ self._accum_reduce(op, scalar_arg, vector_loc, scalar_loc) accum_info = accum_info.next() - def _accum_reduce(self, op, arg, accumloc, targetloc): - # Currently the accumulator can ONLY be the biggest - # 64 bit float/int - tgt = targetloc.value - acc = accumloc.value - if arg.type == FLOAT: - # r = (r[0]+r[1],r[0]+r[1]) - if IS_BIG_ENDIAN: - self.mc.xxpermdi(tgt, acc, acc, 0b00) - else: - self.mc.xxpermdi(tgt, acc, acc, 0b10) - if op == '+': - self.mc.xsadddp(tgt, tgt, acc) - elif op == '*': - self.mc.xsmuldp(tgt, tgt, acc) - else: - not_implemented("sum not implemented") - return - else: - assert arg.type == INT - self.mc.load_imm(r.SCRATCH2, PARAM_SAVE_AREA_OFFSET) - self.mc.stvx(acc, r.SCRATCH2.value, r.SP.value) - self.mc.load(tgt, r.SP.value, PARAM_SAVE_AREA_OFFSET) - self.mc.load(r.SCRATCH.value, r.SP.value, PARAM_SAVE_AREA_OFFSET+8) - if op == '+': - self.mc.add(tgt, tgt, acc) - elif op == '*': - self.mc.mulld(tgt, tgt, acc) - else: - not_implemented("sum not implemented") - return - - not_implemented("reduce sum for %s not impl." % arg) - def emit_vec_int_is_true(self, op, arglocs, regalloc): assert isinstance(op, VectorOp) + # TODO resloc, argloc, sizeloc = arglocs size = sizeloc.value tmp = regalloc.vrm.get_scratch_reg(type=INT).value @@ -266,22 +210,13 @@ def emit_vec_float_eq(self, op, arglocs, regalloc): assert isinstance(op, VectorOp) - resloc, loc1, loc2, sizeloc = arglocs + resloc, loc0, loc1, sizeloc = arglocs size = sizeloc.value - tmp = regalloc.vrm.get_scratch_reg().value - offloc = regalloc.rm.get_scratch_reg() - off = offloc.value - # SP is always 16 byte aligned, and PARAM_SAVE_AREA_OFFSET % 16 == 0 - self.mc.load_imm(offloc, PARAM_SAVE_AREA_OFFSET) - if size == 4: - self.mc.xvcmpeqspx(tmp, loc1.value, loc2.value) - self.mc.stxvw4x(tmp, off, r.SP.value) - elif size == 8: - self.mc.xvcmpeqdpx(tmp, loc1.value, loc2.value) - self.mc.stxvd2x(tmp, off, r.SP.value) + if size == 8: + # bit 3 in last argument sets the condition code + self.mc.VFCE(resloc, loc0, loc1, 3, 0, 1) else: not_implemented("[zarch/assembler] float == for size %d" % size) - self.mc.lvx(resloc.value, off, r.SP.value) flush_vec_cc(self, regalloc, c.VEQI, op.bytesize, resloc) def emit_vec_float_xor(self, op, arglocs, regalloc): @@ -314,14 +249,16 @@ flush_vec_cc(self, regalloc, c.VNEI, op.bytesize, resloc) def emit_vec_cast_int_to_float(self, op, arglocs, regalloc): - res, l0 = arglocs + resloc, loc0 = arglocs offloc = regalloc.rm.get_scratch_reg() off = offloc.value # SP is always 16 byte aligned, and PARAM_SAVE_AREA_OFFSET % 16 == 0 - self.mc.load_imm(offloc, PARAM_SAVE_AREA_OFFSET) - self.mc.stvx(l0.value, off, r.SP.value) - self.mc.lxvd2x(res.value, off, r.SP.value) - self.mc.xvcvsxddp(res.value, res.value) + # bit 1 on mask4 -> supresses inexact exception + self.mc.VCDG(resloc, loc0, 3, 4, m.RND_TOZERO.value) + #self.mc.load_imm(offloc, PARAM_SAVE_AREA_OFFSET) + #self.mc.stvx(l0.value, off, r.SP.value) + #self.mc.lxvd2x(res.value, off, r.SP.value) + #self.mc.xvcvsxddp(res.value, res.value) def emit_vec_int_eq(self, op, arglocs, regalloc): assert isinstance(op, VectorOp) @@ -354,6 +291,15 @@ self.mc.vnor(res.value, res.value, res.value) flush_vec_cc(self, regalloc, c.VEQI, op.bytesize, res) + def emit_vec_cast_float_to_int(self, op, arglocs, regalloc): + res, l0 = arglocs + offloc = regalloc.rm.get_scratch_reg() + v0 = regalloc.vrm.get_scratch_reg(type=INT) + off = offloc.value + # SP is always 16 byte aligned, and PARAM_SAVE_AREA_OFFSET % 16 == 0 + self.mc.load_imm(offloc, PARAM_SAVE_AREA_OFFSET) + self.mc.xvcvdpsxds(res.value, l0.value) + def emit_vec_expand_f(self, op, arglocs, regalloc): assert isinstance(op, VectorOp) resloc, srcloc = arglocs @@ -540,14 +486,40 @@ return not_implemented("unpack for combination src %d -> res %d" % (srcidx, residx)) - def emit_vec_cast_float_to_int(self, op, arglocs, regalloc): - res, l0 = arglocs - offloc = regalloc.rm.get_scratch_reg() - v0 = regalloc.vrm.get_scratch_reg(type=INT) - off = offloc.value - # SP is always 16 byte aligned, and PARAM_SAVE_AREA_OFFSET % 16 == 0 - self.mc.load_imm(offloc, PARAM_SAVE_AREA_OFFSET) - self.mc.xvcvdpsxds(res.value, l0.value) + def _accum_reduce(self, op, arg, accumloc, targetloc): + # Currently the accumulator can ONLY be the biggest + # 64 bit float/int + # TODO + tgt = targetloc.value + acc = accumloc.value + if arg.type == FLOAT: + # r = (r[0]+r[1],r[0]+r[1]) + if IS_BIG_ENDIAN: + self.mc.xxpermdi(tgt, acc, acc, 0b00) + else: + self.mc.xxpermdi(tgt, acc, acc, 0b10) + if op == '+': + self.mc.xsadddp(tgt, tgt, acc) + elif op == '*': + self.mc.xsmuldp(tgt, tgt, acc) + else: + not_implemented("sum not implemented") + return + else: + assert arg.type == INT + self.mc.load_imm(r.SCRATCH2, PARAM_SAVE_AREA_OFFSET) + self.mc.stvx(acc, r.SCRATCH2.value, r.SP.value) + self.mc.load(tgt, r.SP.value, PARAM_SAVE_AREA_OFFSET) + self.mc.load(r.SCRATCH.value, r.SP.value, PARAM_SAVE_AREA_OFFSET+8) + if op == '+': + self.mc.add(tgt, tgt, acc) + elif op == '*': + self.mc.mulld(tgt, tgt, acc) + else: + not_implemented("sum not implemented") + return + + not_implemented("reduce sum for %s not impl." % arg) def emit_vec_f(self, op, arglocs, regalloc): pass diff --git a/rpython/jit/metainterp/test/test_vector.py b/rpython/jit/metainterp/test/test_vector.py --- a/rpython/jit/metainterp/test/test_vector.py +++ b/rpython/jit/metainterp/test/test_vector.py @@ -133,9 +133,9 @@ vec_float_unary = functools.partial(vec_int_unary, _vector_float_unary) - test_vec_abs_float = \ + test_vec_float_abs = \ vec_float_unary(lambda v: abs(v), rffi.DOUBLE) - test_vec_neg_float = \ + test_vec_float_neg = \ vec_float_unary(lambda v: -v, rffi.DOUBLE) # FLOAT BINARY _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit