Author: Richard Plangger <r...@pasra.at> Branch: vecopt Changeset: r77328:2a8cae0c7c8e Date: 2015-05-15 09:39 +0200 http://bitbucket.org/pypy/pypy/changeset/2a8cae0c7c8e/
Log: resolved an issue that would generate wrong packing immediates for shufps. a better approach in the assembler is needed to handle these pack/unpack instructions diff --git a/pypy/module/micronumpy/test/test_zjit.py b/pypy/module/micronumpy/test/test_zjit.py --- a/pypy/module/micronumpy/test/test_zjit.py +++ b/pypy/module/micronumpy/test/test_zjit.py @@ -6,10 +6,16 @@ from rpython.jit.metainterp.test.support import LLJitMixin from rpython.jit.backend.x86.test.test_basic import Jit386Mixin from rpython.jit.metainterp.warmspot import reset_jit, get_stats +from rpython.jit.metainterp.jitprof import Profiler +from rpython.rlib.jit import Counters from pypy.module.micronumpy import boxes from pypy.module.micronumpy.compile import FakeSpace, Parser, InterpreterState from pypy.module.micronumpy.base import W_NDimArray +def get_profiler(): + from rpython.jit.metainterp import pyjitpl + return pyjitpl._warmrunnerdesc.metainterp_sd.profiler + class TestNumpyJit(Jit386Mixin): graph = None interp = None @@ -79,12 +85,23 @@ listcomp=True, backendopt=True, graph_and_interp_only=True, + ProfilerClass=Profiler, vectorize=True) self.__class__.interp = interp self.__class__.graph = graph + def check_vectorized(self, expected_tried, expected_success): + profiler = get_profiler() + tried = profiler.get_counter(Counters.OPT_VECTORIZE_TRY) + success = profiler.get_counter(Counters.OPT_VECTORIZED) + assert tried >= success + assert tried == expected_tried + assert success == expected_success + def run(self, name): self.compile_graph() + profiler = get_profiler() + profiler.start() reset_jit() i = self.code_mapping[name] retval = self.interp.eval_graph(self.graph, [i]) @@ -92,23 +109,25 @@ def define_float32_add(): return """ - a = |30| + a = astype(|30|, float32) b = a + a b -> 15 """ def test_float32_add(self): result = self.run("float32_add") self.assert_float_equal(result, 15.0 + 15.0) + self.check_vectorized(2, 2) def define_float_add(): return """ - a = astype(|30|, float32) + a = |30| b = a + a - b -> 17 + b -> 15 """ def test_float_add(self): result = self.run("float_add") self.assert_float_equal(result, 17.0 + 17.0) + self.check_vectorized(1, 1) def define_float32_add_const(): return """ @@ -119,6 +138,7 @@ def test_float32_add_const(self): result = self.run("float32_add_const") self.assert_float_equal(result, 29.0 + 77.345) + self.check_vectorized(2, 2) def define_float_add_const(): return """ @@ -128,6 +148,7 @@ def test_float_add_const(self): result = self.run("float_add_const") self.assert_float_equal(result, 29.0 + 25.5) + self.check_vectorized(1, 1) def define_pow(): return """ diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py --- a/rpython/jit/backend/x86/assembler.py +++ b/rpython/jit/backend/x86/assembler.py @@ -7,7 +7,7 @@ DEBUG_COUNTER, debug_bridge) from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper from rpython.jit.backend.llsupport.gcmap import allocate_gcmap -from rpython.jit.metainterp.history import Const, Box, VOID, BoxVector +from rpython.jit.metainterp.history import Const, Box, VOID, BoxVector, ConstInt from rpython.jit.metainterp.history import AbstractFailDescr, INT, REF, FLOAT from rpython.rtyper.lltypesystem import lltype, rffi, rstr, llmemory from rpython.rtyper.lltypesystem.lloperation import llop @@ -2576,30 +2576,36 @@ return src_loc select = 0 if item_type == FLOAT: - self.mc.MOVSS(tmp_loc, src_loc) - i = 0 - while i < count: - select |= (index+i<<(i*2)) - i += 1 - self.mc.SHUFPS_xxi(tmp_loc.value, tmp_loc.value, select) - return tmp_loc + if size == 4: + self.mc.MOVUPS(tmp_loc, src_loc) # TODO could be aligned if xx + i = 0 + while i < count: + select |= (index+i<<(i*2)) + i += 1 + self.mc.SHUFPS_xxi(tmp_loc.value, tmp_loc.value, select) + return tmp_loc + else: + py.test.set_trace() + raise NotImplementedError("shuffle by index for float64 not impl") else: py.test.set_trace() raise NotImplementedError("shuffle by index for non floats") def genop_vec_box_pack(self, op, arglocs, resloc): - toloc, fromloc, indexloc, sizeloc = arglocs - toarg = op.getarg(0) - index = indexloc.value - size = sizeloc.value + toloc, fromloc, tmploc = arglocs + result = op.result + indexarg = op.getarg(2) + assert isinstance(result, BoxVector) + assert isinstance(indexarg, ConstInt) + index = indexarg.value + size = result.item_size + #py.test.set_trace() if size == 4: - select = 0 + select = (1 << 2) # move 0 -> 0, 1 -> 1 for toloc + # TODO if index == 2: - select |= (1<<0) - select |= (2<<2) - select |= (3<<4) - select |= (4<<6) + select |= (1<<6) # move 0 -> 2, 1 -> 3 for fromloc else: raise NotImplementedError("index is not equal to 2") @@ -2621,7 +2627,7 @@ self.mc.CVTPS2PD(resloc, loc0) else: assert index == 2 - self.mc.MOVSS_xx(tmploc.value, loc0.value) + self.mc.MOVUPS(tmploc, loc0) # TODO could be aligned if xx select = (2<<0)|(3<<2) # move pos 2->0,3->1 self.mc.SHUFPS_xxi(tmploc.value, tmploc.value, select) self.mc.CVTPS2PD(resloc, tmploc) # expand diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py --- a/rpython/jit/backend/x86/regalloc.py +++ b/rpython/jit/backend/x86/regalloc.py @@ -1477,7 +1477,7 @@ assert not descr.is_array_of_pointers() and \ not descr.is_array_of_structs() itemsize, ofs, _ = unpack_arraydescr(descr) - integer = not descr.is_array_of_floats() + integer = not (descr.is_array_of_floats() or descr.concrete_type == FLOAT) aligned = False args = op.getarglist() base_loc = self.rm.make_sure_var_in_reg(op.getarg(0), args) @@ -1498,7 +1498,7 @@ value_loc = self.make_sure_var_in_reg(op.getarg(2), args) ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args) - integer = not descr.is_array_of_floats() + integer = not (descr.is_array_of_floats() or descr.concrete_type == FLOAT) aligned = False self.perform_discard(op, [base_loc, ofs_loc, value_loc, imm(itemsize), imm(ofs), imm(integer), imm(aligned)]) @@ -1536,15 +1536,13 @@ del consider_vec_logic def consider_vec_box_pack(self, op): - count = op.getarg(3) - index = op.getarg(2) - assert isinstance(count, ConstInt) - assert isinstance(index, ConstInt) - itemsize = self.assembler.cpu.vector_register_size // count.value args = op.getarglist() - loc0 = self.make_sure_var_in_reg(op.getarg(0), args) loc1 = self.make_sure_var_in_reg(op.getarg(1), args) - self.perform(op, [loc0, loc1, imm(index.value), imm(itemsize)], None) + result = self.xrm.force_result_in_reg(op.result, op.getarg(0), args) + tmpxvar = TempBox() + tmploc = self.xrm.force_allocate_reg(tmpxvar) + self.xrm.possibly_free_var(tmpxvar) + self.perform(op, [result, loc1, tmploc], result) def consider_vec_box_unpack(self, op): count = op.getarg(2) diff --git a/rpython/jit/metainterp/history.py b/rpython/jit/metainterp/history.py --- a/rpython/jit/metainterp/history.py +++ b/rpython/jit/metainterp/history.py @@ -563,7 +563,7 @@ raise NotImplementedError("cannot forget value of vector") def clonebox(self): - return BoxVector(self.item_type, self.item_count) + return BoxVector(self.item_type, self.item_count, self.item_size, self.signed) def constbox(self): raise NotImplementedError("not possible to have a constant vector box") diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py --- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py +++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py @@ -1192,8 +1192,8 @@ v224 = vec_float_add(v219, v222, 2) v225 = vec_cast_float_to_singlefloat(v223, 2) v226 = vec_cast_float_to_singlefloat(v224, 2) - vec_box_pack(v225, v226, 2, 2) - vec_raw_store(p2, i4, v225, 4, descr=singlefloatarraydescr) + v227 = vec_box_pack(v225, v226, 2, 2) + vec_raw_store(p2, i4, v227, 4, descr=singlefloatarraydescr) jump(p0, p1, p2, i210, i189) """ vopt = self.vectorize(self.parse_loop(ops)) diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py --- a/rpython/jit/metainterp/optimizeopt/vectorize.py +++ b/rpython/jit/metainterp/optimizeopt/vectorize.py @@ -13,6 +13,7 @@ from rpython.jit.metainterp.resoperation import (rop, ResOperation, GuardResOp) from rpython.rlib.objectmodel import we_are_translated from rpython.rlib.debug import debug_print, debug_start, debug_stop +from rpython.rlib.jit import Counters from rpython.rtyper.lltypesystem import lltype, rffi class NotAVectorizeableLoop(JitException): @@ -42,10 +43,10 @@ inline_short_preamble, start_state, False) orig_ops = loop.operations try: - jitdriver_sd.profiler.count(Counters.OPT_VECTORIZE_TRY) + metainterp_sd.profiler.count(Counters.OPT_VECTORIZE_TRY) opt = VectorizingOptimizer(metainterp_sd, jitdriver_sd, loop, optimizations) opt.propagate_all_forward() - jitdriver_sd.profiler.count(Counters.OPT_VECTORIZED) + metainterp_sd.profiler.count(Counters.OPT_VECTORIZED) except NotAVectorizeableLoop: # vectorization is not possible, propagate only normal optimizations loop.operations = orig_ops @@ -690,8 +691,6 @@ else: # vbox of a variable/constant is not present here pass - if not we_are_translated(): - assert ptype.is_valid() self.pack.ptype = ptype def vector_result(self, vop, packargs): @@ -731,6 +730,7 @@ if packed < packable: args = [op.getoperation().getarg(argidx) for op in ops] self.package(vbox, packed, args, packable) + _, vbox = self.box_to_vbox.get(vop.getarg(argidx), (-1, None)) vop.setarg(argidx, vbox) return vbox @@ -749,13 +749,40 @@ if pos == -1: i += 1 continue + new_box = tgt_box.clonebox() + new_box.item_count += src_box.item_count op = ResOperation(rop.VEC_BOX_PACK, [tgt_box, src_box, ConstInt(i), - ConstInt(src_box.item_count)], None) + ConstInt(src_box.item_count)], new_box) self.preamble_ops.append(op) - tgt_box.item_count += src_box.item_count + self._check_vec_pack(op) i += src_box.item_count + # overwrite the new positions, arguments now live in new_box + # at a new position + for j in range(i): + arg = args[j] + self.box_to_vbox[arg] = (j, new_box) + + def _check_vec_pack(self, op): + result = op.result + arg0 = op.getarg(0) + arg1 = op.getarg(1) + index = op.getarg(2) + count = op.getarg(3) + assert isinstance(result, BoxVector) + assert isinstance(arg0, BoxVector) + assert isinstance(index, ConstInt) + assert isinstance(count, ConstInt) + assert arg0.item_size == result.item_size + if isinstance(arg1, BoxVector): + assert arg1.item_size == result.item_size + else: + assert count.value == 1 + assert index.value < result.item_size + assert index.value + count.value <= result.item_size + assert result.item_count > arg0.item_count + def expand_box_to_vector_box(self, vop, argidx): arg = vop.getarg(argidx) all_same_box = True _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit