Author: Richard Plangger <r...@pasra.at> Branch: vecopt-merge Changeset: r79319:75f6354522df Date: 2015-08-31 14:33 +0200 http://bitbucket.org/pypy/pypy/changeset/75f6354522df/
Log: resolving some issues introduced by the simpler combination and separate splitting phase, needs to remove packs that are not full diff --git a/rpython/jit/metainterp/optimizeopt/schedule.py b/rpython/jit/metainterp/optimizeopt/schedule.py --- a/rpython/jit/metainterp/optimizeopt/schedule.py +++ b/rpython/jit/metainterp/optimizeopt/schedule.py @@ -240,6 +240,11 @@ def getcount(self): return self.count + def pack_byte_size(self, pack): + if len(pack.operations) == 0: + return 0 + return self.getsize() * pack.opcount() + PT_GENERIC = PackType(PackType.UNKNOWN_TYPE, -1, False) PT_FLOAT_2 = PackType(FLOAT, 4, False, 2) @@ -873,11 +878,43 @@ assert not isinstance(box, BoxVector) self.box_to_vbox[box] = (off, vector) +def opcount_filling_vector_register(pack, vec_reg_size): + """ how many operations of that kind can one execute + with a machine instruction of register size X? + """ + pack_type = pack.input_type + if pack_type is None: + pack_type = pack.output_type # load operations + + op = pack.leftmost() + if op.casts_box(): + count = pack_type.getcount() + return count + + count = vec_reg_size // pack_type.getsize() + return count + +def maximum_byte_size(pack, vec_reg_size): + """ The maxmum size in bytes the operation is able to + process with the hardware register and the operation + semantics. + """ + op = pack.leftmost() + if op.casts_box(): + # casting is special, often only takes a half full vector + pack_type = pack.input_type + if pack_type is None: + pack_type = self.output_type # load operations + return pack_type.byte_size() + return vec_reg_size + class Pack(object): """ A pack is a set of n statements that are: * isomorphic * independent """ + FULL = 0 + def __init__(self, ops, input_type, output_type): self.operations = ops self.accum = None @@ -899,30 +936,43 @@ ptype = self.output_type return ptype - def pack_byte_size(self): - return self.pack_type().getsize() * self.opcount() + def input_byte_size(self): + """ The amount of bytes the operations need with the current + entries in self.operations. E.g. cast_singlefloat_to_float + takes only #2 operations. + """ + return self._byte_size(self.input_type) + + def output_byte_size(self): + """ The amount of bytes the operations need with the current + entries in self.operations. E.g. vec_load(..., descr=short) + with 10 operations returns 20 + """ + return self._byte_size(self.output_type) + + def pack_load(self, vec_reg_size): + """ Returns the load of the pack. A value + smaller than 0 indicates that it is empty + or nearly empty, zero indicates that all slots + are used and > 0 indicates that too many operations + are in this pack instance. + """ + if len(self.operations) == 0: + return -1 + size = maximum_byte_size(self, vec_reg_size) + if self.input_type is None: + # e.g. load operations + return self.output_type.pack_byte_size(self) - size + # default only consider the input type + # e.g. store operations, int_add, ... + return self.input_type.pack_byte_size(self) - size + def is_full(self, vec_reg_size): """ If one input element times the opcount is equal to the vector register size, we are full! """ - ptype = self.pack_type() - op = self.leftmost() - if op.casts_box(): - cur_bytes = ptype.getsize() * self.opcount() - max_bytes = self.input_type.byte_size() - assert cur_bytes <= max_bytes - return cur_bytes == max_bytes - - bytes = self.pack_byte_size() - assert bytes <= vec_reg_size - if bytes == vec_reg_size: - return True - if ptype.getcount() != -1: - size = ptype.getcount() * ptype.getsize() - assert bytes <= size - return bytes == size - return False + return self.pack_load(vec_reg_size) == Pack.FULL def opnum(self): assert len(self.operations) > 0 @@ -930,9 +980,8 @@ def clear(self): for node in self.operations: - if node.pack is not self: - node.pack = None - node.pack_position = -1 + node.pack = None + node.pack_position = -1 def update_pack_of_nodes(self): for i,node in enumerate(self.operations): @@ -945,19 +994,28 @@ vector register. """ pack = self - pack_type = self.pack_type() - max_count = vec_reg_size // pack_type.getsize() - assert max_count * pack_type.getsize() == vec_reg_size - while pack.pack_byte_size() > vec_reg_size: - assert max_count > 0 - newpack = pack.clone() - oplist = pack.operations[:max_count] - newpack.operations = pack.operations[max_count:] + while pack.pack_load(vec_reg_size) > Pack.FULL: + pack.clear() + oplist, newoplist = pack.slice_operations(vec_reg_size) pack.operations = oplist pack.update_pack_of_nodes() - newpack.update_pack_of_nodes() - pack = newpack - packlist.append(newpack) + assert pack.is_full(vec_reg_size) + # + newpack = pack.clone(newoplist) + load = newpack.pack_load(vec_reg_size) + if load >= Pack.FULL: + pack = newpack + packlist.append(newpack) + else: + newpack.clear() + + def slice_operations(self, vec_reg_size): + count = opcount_filling_vector_register(self, vec_reg_size) + newoplist = self.operations[count:] + oplist = self.operations[:count] + assert len(newoplist) + len(oplist) == len(self.operations) + assert len(newoplist) != 0 + return oplist, newoplist def rightmost_match_leftmost(self, other): """ Check if pack A can be combined with pack B """ @@ -974,14 +1032,16 @@ return rightmost is leftmost and accum def __repr__(self): + if len(self.operations) == 0: + return "Pack(-, [])" opname = self.operations[0].getoperation().getopname() return "Pack(%s,%r)" % (opname, self.operations) def is_accumulating(self): return self.accum is not None - def clone(self): - cloned = Pack(self.operations, self.input_type, self.output_type) + def clone(self, oplist): + cloned = Pack(oplist, self.input_type, self.output_type) cloned.accum = self.accum return cloned diff --git a/rpython/jit/metainterp/optimizeopt/test/test_schedule.py b/rpython/jit/metainterp/optimizeopt/test/test_schedule.py --- a/rpython/jit/metainterp/optimizeopt/test/test_schedule.py +++ b/rpython/jit/metainterp/optimizeopt/test/test_schedule.py @@ -452,3 +452,19 @@ v9[i64|2] = vec_int_and(v4[i64|2], v8[i64|2]) """, False) self.assert_equal(loop2, loop3) + + def test_split_cast(self): + trace = self.parse(""" + f10 = cast_int_to_float(i1) + f11 = cast_int_to_float(i2) + f12 = cast_int_to_float(i3) + f13 = cast_int_to_float(i4) + """) + pack = self.pack(trace, 0, 4, I64, F32) + packs = [] + pack.split(packs, 16) + packs.append(pack) + assert len(packs) == 2 + + + diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py --- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py +++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py @@ -1427,5 +1427,51 @@ opt = self.schedule(self.parse_loop(trace)) self.debug_print_operations(opt.loop) + def test_arraylen(self): + trace = """ + [i45, i33, p40] + # while i < len(l): + # LOAD_FAST i + # LOAD_GLOBAL len + guard_not_invalidated(descr=<Guard0x7f82c00c3518>) [i33,p40] + # LOAD_FAST l + # CALL_FUNCTION 1 + # COMPARE_OP < + i50 = int_lt(i45, i33) + guard_true(i50) [i50,i33,p40] + # POP_JUMP_IF_FALSE 70 + # l[i] = l[i] + 1 + # LOAD_FAST l + # LOAD_FAST i + # BINARY_SUBSCR + i51 = uint_ge(i45, i33) + guard_false(i51) [i50, i45] + i52 = getarrayitem_gc(p40, i45, descr=intarraydescr) + # LOAD_CONST 1 + # BINARY_ADD + i53 = int_add(i52, 1) + #guard_no_overflow(descr=<Guard0x7f82c00c33b8>) [] + # LOAD_FAST l + # LOAD_FAST i + # STORE_SUBSCR + setarrayitem_gc(p40, i45, i53, descr=intarraydescr) + # i += 1 + # LOAD_FAST i + # LOAD_CONST 1 + # INPLACE_ADD + i54 = int_add(i45,1) + # STORE_FAST i + # JUMP_ABSOLUTE 21 + #getfield_raw_i(140199654614400, descr=<FieldS pypysig_long_struct.c_value 0>) + #None = i55 < 0 + #guard(i56 is false) + # LOAD_FAST i + #i34 = arraylen_gc(p40, descr=<ArrayS 8>) + jump(i54, i33, p40) + """ + opt = self.vectorize(self.parse_loop(trace)) + self.debug_print_operations(opt.loop) + + class TestLLtype(BaseTestVectorize, LLtypeMixin): pass diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py --- a/rpython/jit/metainterp/optimizeopt/vectorize.py +++ b/rpython/jit/metainterp/optimizeopt/vectorize.py @@ -133,10 +133,6 @@ return False -def cmp_pack_lt(a,b): - return a.left.getindex() < b.left.getindex() -packsort = listsort.make_timsort_class(lt=cmp_pack_lt) - class VectorizingOptimizer(Optimizer): """ Try to unroll the loop and find instructions to group """ @@ -401,13 +397,6 @@ """ if len(self.packset.packs) == 0: raise NotAVectorizeableLoop() - #packsort(self.packset.packs).sort() - #if not we_are_translated(): - # # ensure we are really sorted! - # x = 0 - # for i,pack in enumerate(self.packset.packs): - # assert x <= pack.left.getindex() - # x = pack.left.getindex() i = 0 j = 0 end_ij = len(self.packset.packs) @@ -423,33 +412,6 @@ continue pack1 = self.packset.packs[i] pack2 = self.packset.packs[j] - # remove intermediate - left = pack1.operations[0] - #if left in orphan: - # # a pack was filled, thus the rhs was put - # # into the orphan map. - # if orphan[left] is False: - # # this pack might be redundant if pack1.right - # # is the at the left position in another pack - # assert pack1.opcount() == 2 - # right = pack1.operations[1] - # orphan[right] = True - # pack1.clear() - # del self.packset.packs[i] - # end_ij -= 1 - # continue - # else: - # # left is not an orphan, this pack proves that - # # there might be more packs - # del orphan[left] - # check if the pack is already full - #if pack1.is_full(self.cpu.vector_register_size): - # right = pack1.operations[-1] - # # False indicates that the next pair might not - # # be needed, because left is already computed - # # in another set - # orphan[right] = False - # break if pack1.rightmost_match_leftmost(pack2): end_ij = self.packset.combine(i,j) else: @@ -460,11 +422,14 @@ j = 0 if len_before == len(self.packset.packs): break + newpacks = [] + vec_reg_size = self.cpu.vector_register_size for pack in self.packset.packs: - if pack.pack_byte_size() > self.cpu.vector_register_size: - pack.split(self.packset.packs, self.cpu.vector_register_size) - else: - pack.update_pack_of_nodes() + if pack.pack_load(vec_reg_size) > Pack.FULL: + pack.split(newpacks, vec_reg_size) + continue + pack.update_pack_of_nodes() + self.packset.packs.extend(newpacks) if not we_are_translated(): # some test cases check the accumulation variables @@ -483,9 +448,10 @@ if accum: self.packset.accum_vars[accum.var] = accum.pos - print " %dx %s (accum? %d) " % (len(pack.operations), - pack.operations[0].op.getopname(), - accum is not None) + print " %dx %s " % (len(pack.operations), + pack.operations[0].op.getopname()) + if accum: + print " accumulates!" if fail: assert False _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit