Author: Richard Plangger <r...@pasra.at> Branch: vecopt-merge Changeset: r79002:06ec92fa38c0 Date: 2015-08-17 10:30 +0200 http://bitbucket.org/pypy/pypy/changeset/06ec92fa38c0/
def same_shape(self, other):
    """Tell whether ``self`` and ``other`` iterate with the same layout.

    If two iterators share the same shape, next() only needs to be
    called on one of them: the resulting state can be reused for the
    other.

    The iterators are considered equal when the contiguous flag, the
    dtype of the underlying array, shape_m1, strides, backstrides and
    factors all match.

    :param other: the iterator to compare against.
    :return: True if every compared attribute matches, False otherwise.
    """
    return (self.contiguous == other.contiguous and
            # compare against *other*; comparing self.array.dtype with
            # itself is always true and lets differing dtypes through
            self.array.dtype is other.array.dtype and
            self.shape_m1 == other.shape_m1 and
            self.strides == other.strides and
            self.backstrides == other.backstrides and
            self.factors == other.factors)
""" +import py from pypy.interpreter.error import OperationError from rpython.rlib import jit from rpython.rlib.rstring import StringBuilder @@ -13,11 +14,6 @@ from pypy.interpreter.argument import Arguments -call2_driver = jit.JitDriver( - name='numpy_call2', - greens=['shapelen', 'func', 'left', 'right', 'calc_dtype', 'res_dtype'], - reds='auto', vectorize=True) - def call2(space, shape, func, calc_dtype, w_lhs, w_rhs, out): if w_lhs.get_size() == 1: w_left = w_lhs.get_scalar_value().convert_to(space, calc_dtype) @@ -38,28 +34,96 @@ out_iter, out_state = out.create_iter(shape) shapelen = len(shape) res_dtype = out.get_dtype() - while not out_iter.done(out_state): - call2_driver.jit_merge_point(shapelen=shapelen, func=func, - left=left_iter is None, - right=right_iter is None, - calc_dtype=calc_dtype, res_dtype=res_dtype) - if left_iter: - w_left = left_iter.getitem(left_state).convert_to(space, calc_dtype) - left_state = left_iter.next(left_state) - if right_iter: - w_right = right_iter.getitem(right_state).convert_to(space, calc_dtype) - right_state = right_iter.next(right_state) - w_out = func(calc_dtype, w_left, w_right) - out_iter.setitem(out_state, w_out.convert_to(space, res_dtype)) - out_state = out_iter.next(out_state) - # if not set to None, the values will be loop carried - # (for the var,var case), forcing the vectorization to unpack - # the vector registers at the end of the loop - if left_iter: - w_left = None - if right_iter: - w_right = None - return out + call2_func = try_to_share_iterators_call2(left_iter, right_iter, + left_state, right_state, out_state) + params = (space, shapelen, func, calc_dtype, res_dtype, out, + w_left, w_right, left_iter, right_iter, out_iter, + left_state, right_state, out_state) + return call2_func(*params) + +def try_to_share_iterators_call2(left_iter, right_iter, left_state, right_state, out_state): + # these are all possible iterator sharing combinations + # left == right == out + # left == right + # left == out + # 
def generate_call2_cases(name, left_state, right_state):
    """Build one specialised call2 loop together with its own JitDriver.

    :param name: suffix used for the JitDriver name ('numpy_call2_' + name)
        and for the generated function name ('call2_' + name).
    :param left_state: name of the state variable the left operand is read
        through inside the loop ("left_state" or "out_state").
    :param right_state: name of the state variable the right operand is
        read through ("right_state", "left_state" or "out_state").
    :return: the generated loop function. It takes (space, shapelen, func,
        calc_dtype, res_dtype, out, w_left, w_right, left_iter, right_iter,
        out_iter, left_state, right_state, out_state) and returns ``out``.

    An operand state is only advanced when the operand is read through
    its own state; when it borrows another state, advancing that other
    state is enough.
    """
    call2_driver = jit.JitDriver(name='numpy_call2_' + name,
        greens=['shapelen', 'func', 'calc_dtype', 'res_dtype'],
        reds='auto', vectorize=True)
    #
    # The generated function sees exactly these names as its globals.
    # Using an explicit dict (instead of exec'ing into locals() and then
    # reading ``method`` back as a local) does not depend on how the
    # interpreter treats modifications of the locals() dictionary.
    namespace = {
        'call2_driver': call2_driver,
        'advance_left_state': left_state == "left_state",
        'advance_right_state': right_state == "right_state",
    }
    code = """
    def method(space, shapelen, func, calc_dtype, res_dtype, out,
               w_left, w_right, left_iter, right_iter, out_iter,
               left_state, right_state, out_state):
        while not out_iter.done(out_state):
            call2_driver.jit_merge_point(shapelen=shapelen, func=func,
                    calc_dtype=calc_dtype, res_dtype=res_dtype)
            if left_iter:
                w_left = left_iter.getitem({left_state}).convert_to(space, calc_dtype)
            if right_iter:
                w_right = right_iter.getitem({right_state}).convert_to(space, calc_dtype)
            w_out = func(calc_dtype, w_left, w_right)
            out_iter.setitem(out_state, w_out.convert_to(space, res_dtype))
            out_state = out_iter.next(out_state)
            if advance_left_state and left_iter:
                left_state = left_iter.next(left_state)
            if advance_right_state and right_iter:
                right_state = right_iter.next(right_state)
            #
            # if not set to None, the values will be loop carried
            # (for the var,var case), forcing the vectorization to unpack
            # the vector registers at the end of the loop
            if left_iter:
                w_left = None
            if right_iter:
                w_right = None
        return out
    """
    source = py.code.Source(code.format(left_state=left_state,
                                        right_state=right_state))
    exec(source.compile(), namespace)
    method = namespace['method']
    method.__name__ = "call2_" + name
    return method
def is_array_of_primitives(self):
    """Tell whether the item kind of this array counts as primitive.

    The kind is obtained from getkind(self.A.OF); 'float' and 'int'
    are accepted, as is the empty string.

    NOTE(review): it is not visible here which item types, if any,
    yield the kind '' -- confirm against getkind before relying on it.
    """
    item_kind = getkind(self.A.OF)
    return item_kind in ('float', 'int', '')
+ node.memory_ref = MemoryRef(op, idx_ref, {raw_access}) + self.memory_refs[node] = node.memory_ref """ exec py.code.Source(array_access_source .format(name='RAW_LOAD',raw_access=True)).compile() @@ -816,6 +817,10 @@ .format(name='GETARRAYITEM_RAW',raw_access=False)).compile() exec py.code.Source(array_access_source .format(name='SETARRAYITEM_RAW',raw_access=False)).compile() + exec py.code.Source(array_access_source + .format(name='GETARRAYITEM_GC',raw_access=False)).compile() + exec py.code.Source(array_access_source + .format(name='SETARRAYITEM_GC',raw_access=False)).compile() del array_access_source integral_dispatch_opt = make_dispatcher_method(IntegralForwardModification, 'operation_') IntegralForwardModification.inspect_operation = integral_dispatch_opt diff --git a/rpython/jit/metainterp/optimizeopt/schedule.py b/rpython/jit/metainterp/optimizeopt/schedule.py --- a/rpython/jit/metainterp/optimizeopt/schedule.py +++ b/rpython/jit/metainterp/optimizeopt/schedule.py @@ -692,8 +692,10 @@ rop.VEC_RAW_LOAD: LOAD_TRANS, rop.VEC_GETARRAYITEM_RAW: LOAD_TRANS, + rop.VEC_GETARRAYITEM_GC: LOAD_TRANS, rop.VEC_RAW_STORE: STORE_TRANS, rop.VEC_SETARRAYITEM_RAW: STORE_TRANS, + rop.VEC_SETARRAYITEM_GC: STORE_TRANS, rop.VEC_CAST_FLOAT_TO_SINGLEFLOAT: OpToVectorOpConv(PT_DOUBLE_2, PT_FLOAT_2), rop.VEC_CAST_SINGLEFLOAT_TO_FLOAT: OpToVectorOpConv(PT_FLOAT_2, PT_DOUBLE_2), diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py --- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py +++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py @@ -212,21 +212,6 @@ """ self.assert_vectorize(self.parse_loop(ops), self.parse_loop(ops)) - def test_vectorize_skip_impossible_2(self): - ops = """ - [p0,i0] - i1 = int_add(i0,1) - i2 = int_le(i1, 10) - guard_true(i2) [] - i3 = getarrayitem_gc(p0,i0,descr=intarraydescr) - jump(p0,i1) - """ - try: - self.vectorize(self.parse_loop(ops)) - 
py.test.fail("should not happend") - except NotAVectorizeableLoop: - pass - def test_unroll_empty_stays_empty(self): """ has no operations in this trace, thus it stays empty after unrolling it 2 times """ @@ -264,6 +249,26 @@ """ self.assert_vectorize(self.parse_loop(ops), self.parse_loop(ops)) + def test_load_primitive_python_list(self): + """ it currently rejects pointer arrays """ + ops = """ + [p0,i0] + i2 = getarrayitem_gc(p0,i0,descr=floatarraydescr) + i1 = int_add(i0,1) + i3 = getarrayitem_gc(p0,i1,descr=floatarraydescr) + i4 = int_add(i1,1) + jump(p0,i4) + """ + opt = """ + [p0,i0] + i1 = int_add(i0,1) + i2 = int_add(i0,2) + i3 = vec_getarrayitem_gc(p0,i0,2,descr=floatarraydescr) + jump(p0,i2) + """ + vopt = self.vectorize(self.parse_loop(ops),0) + self.assert_equal(vopt.loop, self.parse_loop(opt)) + def test_vect_unroll_char(self): """ a 16 byte vector register can hold 16 bytes thus it is unrolled 16 times. (it is the smallest type in the trace) """ @@ -316,7 +321,7 @@ def test_estimate_unroll_factor_smallest_byte_zero(self): ops = """ [p0,i0] - raw_load(p0,i0,descr=arraydescr2) + raw_load(p0,i0,descr=arraydescr) jump(p0,i0) """ vopt = self.vectoroptimizer(self.parse_loop(ops)) @@ -326,7 +331,7 @@ def test_array_operation_indices_not_unrolled(self): ops = """ [p0,i0] - raw_load(p0,i0,descr=arraydescr2) + raw_load(p0,i0,descr=arraydescr) jump(p0,i0) """ vopt = self.vectoroptimizer_unrolled(self.parse_loop(ops),0) diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py --- a/rpython/jit/metainterp/optimizeopt/vectorize.py +++ b/rpython/jit/metainterp/optimizeopt/vectorize.py @@ -253,13 +253,12 @@ def linear_find_smallest_type(self, loop): # O(#operations) for i,op in enumerate(loop.operations): - if op.is_raw_array_access(): + if op.is_primitive_array_access(): descr = op.getdescr() - if not descr.is_array_of_pointers(): - byte_count = descr.get_item_size_in_bytes() - if self.smallest_type_bytes == 0 \ 
def is_primitive_array_access(self):
    """Indicate whether this operation loads or stores a primitive
    type (int, float).

    :return: True when the operation is a primitive load or store
        (is_primitive_load() / is_primitive_store()) and its descr
        reports an array of primitives; False otherwise.
    """
    if not (self.is_primitive_load() or self.is_primitive_store()):
        # not an array load/store: the descr is never consulted
        return False
    return bool(self.getdescr().is_array_of_primitives())
'RAW_LOAD/2d', 'VEC_RAW_LOAD/3d', + 'VEC_GETARRAYITEM_GC/3d', '_RAW_LOAD_LAST', 'GETINTERIORFIELD_GC/2d', @@ -596,13 +605,14 @@ '_NOSIDEEFFECT_LAST', # ----- end of no_side_effect operations ----- 'INCREMENT_DEBUG_COUNTER/1', - 'SETARRAYITEM_GC/3d', '_RAW_STORE_FIRST', + 'SETARRAYITEM_GC/3d', 'SETARRAYITEM_RAW/3d', 'VEC_SETARRAYITEM_RAW/3d', 'RAW_STORE/3d', 'VEC_RAW_STORE/3d', + 'VEC_SETARRAYITEM_GC/3d', '_RAW_STORE_LAST', 'SETINTERIORFIELD_GC/3d', @@ -796,8 +806,10 @@ _opvector = { rop.RAW_LOAD: rop.VEC_RAW_LOAD, rop.GETARRAYITEM_RAW: rop.VEC_GETARRAYITEM_RAW, + rop.GETARRAYITEM_GC: rop.VEC_GETARRAYITEM_GC, rop.RAW_STORE: rop.VEC_RAW_STORE, rop.SETARRAYITEM_RAW: rop.VEC_SETARRAYITEM_RAW, + rop.SETARRAYITEM_GC: rop.VEC_SETARRAYITEM_GC, rop.INT_ADD: rop.VEC_INT_ADD, rop.INT_SUB: rop.VEC_INT_SUB, _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit