Author: Richard Plangger <r...@pasra.at> Branch: vecopt Changeset: r78330:9150ce6cdf52 Date: 2015-06-27 18:10 +0200 http://bitbucket.org/pypy/pypy/changeset/9150ce6cdf52/
Log: added guard_true/false for vector register as first argument diff --git a/pypy/module/micronumpy/test/test_zjit.py b/pypy/module/micronumpy/test/test_zjit.py --- a/pypy/module/micronumpy/test/test_zjit.py +++ b/pypy/module/micronumpy/test/test_zjit.py @@ -276,7 +276,11 @@ def test_int8_expand(self): result = self.run("int8_expand") assert int(result) == 17*8 + sum(range(0,17)) - self.check_vectorized(3, 1) # TODO sum at the end + # does not pay off to cast float64 -> int8 + # neither does sum + # a + c should work, but it is given as a parameter + # thus the accum must handle this! + self.check_vectorized(3, 0) # TODO def define_int32_add_const(): return """ @@ -535,25 +539,109 @@ def define_any(): return """ + a = astype([0,0,0,0,0,0,0,1,0,0,0],int8) + any(a) + """ + + def define_any_int(): + return """ + a = astype([0,0,0,0,256,65537,0,0,0,0,0],int16) + any(a) + """ + + def define_any_ret_0(): + return """ + a = astype([0,0,0,0,0,0,0,0,0,0,0],int64) + any(a) + """ + + def define_float_any(): + return """ a = [0,0,0,0,0,0,0,1,0,0,0] any(a) """ + def define_float32_any(): + return """ + a = astype([0,0,0,0,0,0,0,1,0,0,0], float32) + any(a) + """ + + def test_float_any(self): + result = self.run("float_any") + assert int(result) == 1 + self.check_vectorized(2, 2) + + def test_float32_any(self): + result = self.run("float32_any") + assert int(result) == 1 + self.check_vectorized(1, 1) + def test_any(self): - result = self.run("any") - assert result == 1 - self.check_vectorized(1, 0) + result = self.run("float_any") + assert int(result) == 1 + self.check_vectorized(1, 1) + + def test_any_int(self): + result = self.run("any_int") + assert int(result) == 1 + self.check_vectorized(2, 1) + + def test_any_ret_0(self): + result = self.run("any_ret_0") + assert int(result) == 0 + self.check_vectorized(2, 2) def define_all(): return """ + a = astype([1,1,1,1,1,1,1,1],int32) + all(a) + """ + def define_all_int(): + return """ + a = astype([1,100,255,1,3,1,1,1],int32) + 
all(a) + """ + def define_all_ret_0(): + return """ + a = astype([1,1,1,1,1,0,1,1],int32) + all(a) + """ + def define_float_all(): + return """ a = [1,1,1,1,1,1,1,1] all(a) """ + def define_float32_all(): + return """ + a = astype([1,1,1,1,1,1,1,1],float32) + all(a) + """ + + def test_float_all(self): + result = self.run("float_all") + assert int(result) == 1 + self.check_vectorized(2, 2) + + def test_float_all(self): + result = self.run("float32_all") + assert int(result) == 1 + self.check_vectorized(2, 2) def test_all(self): result = self.run("all") - assert result == 1 - self.check_vectorized(1, 1) + assert int(result) == 1 + self.check_vectorized(2, 2) + + def test_all_int(self): + result = self.run("all_int") + assert int(result) == 1 + self.check_vectorized(2, 2) + + def test_all_ret_0(self): + result = self.run("all_ret_0") + assert int(result) == 0 + self.check_vectorized(2, 2) def define_logical_xor_reduce(): return """ diff --git a/rpython/doc/jit/vectorization.rst b/rpython/doc/jit/vectorization.rst --- a/rpython/doc/jit/vectorization.rst +++ b/rpython/doc/jit/vectorization.rst @@ -54,5 +54,8 @@ The opcode needed spans over multiple instructions. In terms of performance there might only be little to non advantage to use SIMD instructions for this conversions. +* For a guard that checks true/false on a vector integer register, it would be handy + to have 2 xmm registers (one filled with zero bits and the other with every bit set to one). + This cuts down 2 instructions for guard checking, trading for higher register pressure. .. 
_PMUL: http://stackoverflow.com/questions/8866973/can-long-integer-routines-benefit-from-sse/8867025#8867025 diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py --- a/rpython/jit/backend/x86/assembler.py +++ b/rpython/jit/backend/x86/assembler.py @@ -1644,10 +1644,38 @@ self.mc.MOVD32_xr(resloc.value, eax.value) self.mc.PUNPCKLDQ_xx(resloc.value, loc1.value) + def genop_guard_vector_arg(self, guard_op, loc): + arg = guard_op.getarg(0) + assert isinstance(arg, BoxVector) + size = arg.item_size + temp = X86_64_XMM_SCRATCH_REG + # + self.mc.PXOR(temp, temp) + # if the vector is not fully packed blend 1s + if not arg.fully_packed(self.cpu.vector_register_size): + self.mc.PCMPEQQ(temp, temp) # fill with ones + select = 0 + bits_used = (arg.item_count * arg.item_size * 8) + index = bits_used // 16 + while index < 8: + select |= (1 << index) + index += 1 + self.mc.PBLENDW_xxi(loc, temp, select) + # reset to zeros + self.mc.PXOR(temp, temp) + + self.mc.PCMPEQ(size, loc, temp) + self.mc.PCMPEQQ(temp, temp) + self.mc.PTEST(loc, temp) + def genop_guard_guard_true(self, ign_1, guard_op, guard_token, locs, ign_2): loc = locs[0] - self.mc.TEST(loc, loc) - self.implement_guard(guard_token, 'Z') + if loc.is_xmm: + self.genop_guard_vector_arg(guard_op, loc) + self.implement_guard(guard_token, 'Z') + else: + self.mc.TEST(loc, loc) + self.implement_guard(guard_token, 'Z') genop_guard_guard_nonnull = genop_guard_guard_true def genop_guard_guard_no_exception(self, ign_1, guard_op, guard_token, @@ -1724,8 +1752,12 @@ def genop_guard_guard_false(self, ign_1, guard_op, guard_token, locs, ign_2): loc = locs[0] - self.mc.TEST(loc, loc) - self.implement_guard(guard_token, 'NZ') + if loc.is_xmm: + self.genop_guard_vector_arg(guard_op, loc) + self.implement_guard(guard_token, 'Z') + else: + self.mc.TEST(loc, loc) + self.implement_guard(guard_token, 'NZ') genop_guard_guard_isnull = genop_guard_guard_false def genop_guard_guard_value(self, ign_1, guard_op, 
guard_token, locs, ign_2): @@ -2723,7 +2755,7 @@ def genop_vec_int_expand(self, op, arglocs, resloc): srcloc, sizeloc = arglocs if not isinstance(srcloc, RegLoc): - self.mov(X86_64_SCRATCH_REG, srcloc) + self.mov(srcloc, X86_64_SCRATCH_REG) srcloc = X86_64_SCRATCH_REG assert not srcloc.is_xmm size = sizeloc.value diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py --- a/rpython/jit/backend/x86/regalloc.py +++ b/rpython/jit/backend/x86/regalloc.py @@ -390,12 +390,22 @@ return self.xrm.loc(v) return self.rm.loc(v) + def _consider_guard_tf(self, op): + arg = op.getarg(0) + if arg.type == VECTOR: + assert arg.item_type == INT + loc = self.xrm.make_sure_var_in_reg(arg) + else: + loc = self.rm.make_sure_var_in_reg(arg) + self.perform_guard(op, [loc], None) + + consider_guard_true = _consider_guard_tf + consider_guard_false = _consider_guard_tf + def _consider_guard(self, op): loc = self.rm.make_sure_var_in_reg(op.getarg(0)) self.perform_guard(op, [loc], None) - consider_guard_true = _consider_guard - consider_guard_false = _consider_guard consider_guard_nonnull = _consider_guard consider_guard_isnull = _consider_guard diff --git a/rpython/jit/backend/x86/regloc.py b/rpython/jit/backend/x86/regloc.py --- a/rpython/jit/backend/x86/regloc.py +++ b/rpython/jit/backend/x86/regloc.py @@ -601,6 +601,28 @@ self._reuse_scratch_register = False self._scratch_register_known = False + def _vector_size_choose(name): + def invoke(self, suffix, val1, val2): + methname = name + suffix + _rx86_getattr(self, methname)(val1, val2) + invoke._annspecialcase_ = 'specialize:arg(1)' + + def INSN(self, size, loc1, loc2): + code1 = loc1.location_code() + code2 = loc2.location_code() + val1 = getattr(loc1, "value_" + code1)() + val2 = getattr(loc2, "value_" + code2)() + suffix = 'B' + if size == 2: + suffix = 'W' + elif size == 4: + suffix = 'D' + else: + suffix = 'Q' + invoke(self, suffix + "_"+ code1+code2, val1, val2) + + return INSN + AND = _binaryop('AND') 
OR = _binaryop('OR') OR8 = _binaryop('OR8') @@ -610,6 +632,7 @@ SHR = _binaryop('SHR') SAR = _binaryop('SAR') TEST = _binaryop('TEST') + PTEST = _binaryop('PTEST') TEST8 = _binaryop('TEST8') BTS = _binaryop('BTS') @@ -621,6 +644,11 @@ CMP = _binaryop('CMP') CMP16 = _binaryop('CMP16') + PCMPEQQ = _binaryop('PCMPEQQ') + PCMPEQD = _binaryop('PCMPEQD') + PCMPEQW = _binaryop('PCMPEQW') + PCMPEQB = _binaryop('PCMPEQB') + PCMPEQ = _vector_size_choose('PCMPEQ') MOV = _binaryop('MOV') MOV8 = _binaryop('MOV8') MOV16 = _binaryop('MOV16') @@ -698,7 +726,6 @@ PAND = _binaryop('PAND') POR = _binaryop('POR') PXOR = _binaryop('PXOR') - PCMPEQD = _binaryop('PCMPEQD') PSRLDQ = _binaryop('PSRLDQ') MOVDQ = _binaryop('MOVDQ') diff --git a/rpython/jit/backend/x86/rx86.py b/rpython/jit/backend/x86/rx86.py --- a/rpython/jit/backend/x86/rx86.py +++ b/rpython/jit/backend/x86/rx86.py @@ -766,6 +766,8 @@ PINSRB_xri = xmminsn('\x66', rex_nw, '\x0F\x3A\x20', register(1,8), register(2), '\xC0', immediate(3, 'b')) INSERTPS_xxi = xmminsn('\x66', rex_nw, '\x0F\x3A\x21', register(1,8), register(2), '\xC0', immediate(3, 'b')) + PTEST_xx = xmminsn('\x66', rex_nw, '\x0F\x38\x17', register(1,8), register(2), '\xC0') + PBLENDW_xxi = xmminsn('\x66', rex_nw, '\x0F\x3A\x0E', register(1,8), register(2), '\xC0', immediate(3, 'b')) # ------------------------------------------------------------ @@ -1003,7 +1005,10 @@ define_pxmm_insn('PUNPCKHDQ_x*', '\x6A') define_pxmm_insn('PUNPCKLQDQ_x*', '\x6C') define_pxmm_insn('PUNPCKHQDQ_x*', '\x6D') +define_pxmm_insn('PCMPEQQ_x*', '\x38\x29') define_pxmm_insn('PCMPEQD_x*', '\x76') +define_pxmm_insn('PCMPEQW_x*', '\x75') +define_pxmm_insn('PCMPEQB_x*', '\x74') # ____________________________________________________________ diff --git a/rpython/jit/metainterp/history.py b/rpython/jit/metainterp/history.py --- a/rpython/jit/metainterp/history.py +++ b/rpython/jit/metainterp/history.py @@ -540,6 +540,9 @@ def getcount(self): return self.item_count + def fully_packed(self, 
vec_reg_size): + return self.item_size * self.item_count == vec_reg_size + def forget_value(self): raise NotImplementedError("cannot forget value of vector") diff --git a/rpython/jit/metainterp/optimizeopt/schedule.py b/rpython/jit/metainterp/optimizeopt/schedule.py --- a/rpython/jit/metainterp/optimizeopt/schedule.py +++ b/rpython/jit/metainterp/optimizeopt/schedule.py @@ -367,7 +367,7 @@ # the argument has more items than the operation is able to process! # box_pos == 0 then it is already at the right place if box_pos != 0: - args[i] = self.unpack(vbox, box_pos, packable, self.input_type) + args[i] = self.unpack(vbox, box_pos, packed - box_pos, self.input_type) self.update_arg_in_vector_pos(i, args[i]) #self.update_input_output(self.pack) continue @@ -384,7 +384,7 @@ if box_pos != 0: # The vector box is at a position != 0 but it # is required to be at position 0. Unpack it! - args[i] = self.unpack(vbox, box_pos, packable, self.input_type) + args[i] = self.unpack(vbox, box_pos, packed - box_pos, self.input_type) self.update_arg_in_vector_pos(i, args[i]) continue #self.update_input_output(self.pack) @@ -450,6 +450,7 @@ def unpack(self, vbox, index, count, arg_ptype): assert index < vbox.item_count assert index + count <= vbox.item_count + assert count > 0 vbox_cloned = vectorbox_clone_set(vbox, count=count) opnum = getunpackopnum(vbox.item_type) op = ResOperation(opnum, [vbox, ConstInt(index), ConstInt(count)], vbox_cloned) @@ -787,7 +788,6 @@ def setvector_of_box(self, box, off, vector): assert off < vector.item_count - print "set" , box, "[",off,"] =", vector self.box_to_vbox[box] = (off, vector) def prepend_invariant_operations(self, oplist): diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py --- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py +++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py @@ -1357,40 +1357,23 @@ def test_abc(self): trace=""" - label(p0, 
p1, p5, p6, p7, p17, p19, i53, i39, i44, i49, i51, descr=TargetToken(140531585719072)) - guard_not_invalidated(descr=<Guard0x7fd00f3ebdb0>) [p1, p0, p5, p6, p7, p17, p19] - i63 = int_ge(i53, 2024) - guard_false(i63, descr=<Guard0x7fd00f3ebe08>) [p1, p0, p5, p6, p7, p17, p19, i53] - i64 = int_lt(i53, i39) - guard_true(i64, descr=<Guard0x7fd00f3ebe60>) [p1, p0, i53, p5, p6, p7, p17, p19, None] - f65 = getarrayitem_raw(i44, i53, descr=floatarraydescr) - f66 = float_add(f65, 1.000000) - i67 = int_lt(i53, i49) - guard_true(i67, descr=<Guard0x7fd00f3ebeb8>) [p1, p0, i53, p5, p6, p7, p17, p19, f66, None] - setarrayitem_raw(i51, i53, f66, descr=floatarraydescr) - i68 = int_add(i53, 1) - i69 = getfield_raw(140531584083072, descr=<FieldS pypysig_long_struct.c_value 0>) - setfield_gc(59, i68, descr=<FieldS pypy.objspace.std.typeobject.IntMutableCell.inst_intvalue 8>) - i70 = int_lt(i69, 0) - guard_false(i70, descr=<Guard0x7fd00f3ebf10>) [p1, p0, p5, p6, p7, p17, p19, None, None] - jump(p0, p1, p5, p6, p7, p17, p19, i68, i39, i44, i49, i51) - """ - trace=""" - [p0, p1, p9, i10, p4, i11, p3, p6, p12, i13, i14, i15, f16, i17, i18] - guard_early_exit(descr=<rpython.jit.metainterp.compile.ResumeAtLoopHeaderDescr object at 0x7f2327d4b390>) [p6, p4, p3, p1, p0, i14, i10, i13, i11, p9, p12] - i19 = raw_load(i15, i11, descr=singlefloatarraydescr) - guard_not_invalidated(descr=<rpython.jit.metainterp.compile.ResumeGuardNotInvalidated object at 0x7f23284786d0>) [p6, p4, p3, p1, p0, i19, i14, i10, i13, i11, p9, p12] - i21 = int_add(i11, 4) - f22 = cast_singlefloat_to_float(i19) - f23 = float_add(f22, f16) - i24 = cast_float_to_singlefloat(f23) - raw_store(i17, i14, i24, descr=singlefloatarraydescr) - i26 = int_add(i13, 1) - i28 = int_add(i14, 4) - i29 = int_ge(i26, i18) - guard_false(i29, descr=<rpython.jit.metainterp.compile.ResumeGuardFalseDescr object at 0x7f2327d53910>) [p6, p4, p3, p1, p0, i28, i21, i26, None, i10, None, None, p9, p12] - debug_merge_point(0, 0, '(numpy_call2: no 
get_printable_location)') - jump(p0, p1, p9, i10, p4, i21, p3, p6, p12, i26, i28, i15, f16, i17, i18) + [p0, p9, i10, p3, i11, p12, i13, p6, i14, p7, p15, i16, i17, i18, i19, i20, i21] + guard_early_exit(descr=<rpython.jit.metainterp.compile.ResumeAtLoopHeaderDescr object at 0x7f09b34aad50>) [p7, p6, p3, p0, i14, i17, i16, p9, p15, i11, i10, p12, i13] + i22 = raw_load(i18, i11, descr=singlefloatarraydescr) + guard_not_invalidated(descr=<rpython.jit.metainterp.compile.ResumeGuardNotInvalidated object at 0x7f09b34fd390>) [p7, p6, p3, p0, i22, i14, i17, i16, p9, p15, i11, i10, p12, i13] + i24 = int_add(i11, 4) + i25 = raw_load(i19, i17, descr=singlefloatarraydescr) + i27 = int_add(i17, 4) + f28 = cast_singlefloat_to_float(i22) + f29 = cast_singlefloat_to_float(i25) + f30 = float_add(f28, f29) + i31 = cast_float_to_singlefloat(f30) + raw_store(i20, i14, i31, descr=singlefloatarraydescr) + i33 = int_add(i13, 1) + i35 = int_add(i14, 4) + i36 = int_ge(i33, i21) + guard_false(i36, descr=<rpython.jit.metainterp.compile.ResumeGuardFalseDescr object at 0x7f09b34b7c10>) [p7, p6, p3, p0, i35, i24, i33, i27, None, None, i16, p9, p15, None, i10, p12, None] + jump(p0, p9, i10, p3, i24, p12, i33, p6, i35, p7, p15, i16, i27, i18, i19, i20, i21) """ opt = self.vectorize(self.parse_loop(trace)) self.debug_print_operations(opt.loop) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit