Author: Richard Plangger <r...@pasra.at> Branch: vecopt2 Changeset: r77128:d16c3d437d4e Date: 2015-05-01 09:43 +0200 http://bitbucket.org/pypy/pypy/changeset/d16c3d437d4e/
Log: weaker guards are stripped from the trace quick and dirty implementation to remove redundant index calculations (j=i+1;k=j+1 => j=i+1;k=i+2) consider to move this into the rewrite optimizer (as fijal suggested) diff --git a/rpython/jit/metainterp/optimizeopt/dependency.py b/rpython/jit/metainterp/optimizeopt/dependency.py --- a/rpython/jit/metainterp/optimizeopt/dependency.py +++ b/rpython/jit/metainterp/optimizeopt/dependency.py @@ -54,7 +54,9 @@ if exclude_last: count -= 1 while i < count: - if not self.path[i].op.has_no_side_effect(): + op = self.path[i].getoperation() + if not op.has_no_side_effect() \ + and op.getopnum() != rop.GUARD_EARLY_EXIT: return False i += 1 return True @@ -62,6 +64,9 @@ def walk(self, node): self.path.append(node) + def cut_off_at(self, index): + self.path = self.path[:index] + def clone(self): return Path(self.path[:]) @@ -89,26 +94,26 @@ def getfailarg_set(self): op = self.getoperation() assert isinstance(op, GuardResOp) - args = [] + args = {} if op.getfailargs(): for arg in op.getfailargs(): - args.append(arg) - return args + args[arg] = None + return args.keys() elif op.rd_snapshot: ss = op.rd_snapshot assert isinstance(ss, Snapshot) while ss: for box in ss.boxes: - args.append(box) + args[box] = None ss = ss.prev - return args + return args.keys() def relax_guard_to(self, guard): """ Relaxes a guard operation to an earlier guard. """ tgt_op = self.getoperation() - op = guard + op = guard.getoperation() assert isinstance(tgt_op, GuardResOp) assert isinstance(op, GuardResOp) #descr = compile.ResumeAtLoopHeaderDescr() @@ -237,24 +242,34 @@ worklist.append(dep.to) return True - def iterate_paths(self, to, backwards=False): + def iterate_paths(self, to, backwards=False, path_max_len=-1): """ yield all nodes from self leading to 'to' """ if self == to: return - worklist = [(Path([self]),self)] + path = Path([self]) + worklist = [(0, self, 1)] while len(worklist) > 0: - path,node = worklist.pop() + index,node,pathlen = worklist.pop() if backwards: iterdir = node.depends() else: iterdir = node.provides() - for dep in iterdir: - cloned_path = path.clone() - cloned_path.walk(dep.to) - if dep.to == to: - yield cloned_path + if index >= len(iterdir): + continue + else: + next_dep = iterdir[index] + next_node = next_dep.to + index += 1 + if index < len(iterdir): + worklist.append((index, node, pathlen)) + path.cut_off_at(pathlen) + path.walk(next_node) + pathlen += 1 + + if next_node is to or (path_max_len > 0 and pathlen >= path_max_len): + yield path else: - worklist.append((cloned_path,dep.to)) + worklist.append((0, next_node, pathlen)) def remove_edge_to(self, node): i = 0 @@ -661,7 +676,10 @@ to = dep.to node.remove_edge_to(to) if not to.emitted and to.depends_count() == 0: - self.schedulable_nodes.append(to) + if to.pack: + self.schedulable_nodes.append(to) + else: + self.schedulable_nodes.insert(0, to) node.clear_dependencies() node.emitted = True @@ -682,6 +700,18 @@ var = self.index_vars[arg] = IndexVar(arg) return var + def operation_INT_LT(self, op, node): + box_a0 = op.getarg(0) + box_a1 = op.getarg(1) + left = None + right = None + if not self.is_const_integral(box_a0): + left = self.get_or_create(box_a0) + if not self.is_const_integral(box_a1): + right = self.get_or_create(box_a1) + box_r = op.result + self.index_vars[box_r] = IndexGuard(op.getopnum(), left, right) + additive_func_source = """ def operation_{name}(self, op, node): box_r = op.result @@ -762,6 +792,25 @@ IntegralForwardModification.inspect_operation = integral_dispatch_opt del integral_dispatch_opt +class IndexGuard(object): + def __init__(self, opnum, lindex_var, rindex_var): + self.opnum = opnum + self.lindex_var = lindex_var + self.rindex_var = rindex_var + + def getindex_vars(self): + if self.lindex_var and self.rindex_var: + return (self.lindex_var, self.rindex_var) + elif self.lindex_var: + return (self.lindex_var,) + elif self.rindex_var: + return (self.rindex_var,) + else: + assert False, "integer comparison must have left or right index" + + def adapt_operation(self, op): + pass + class IndexVar(object): def __init__(self, var): self.var = var @@ -769,6 +818,9 @@ self.coefficient_div = 1 self.constant = 0 + def getvariable(self): + return self.var + def __eq__(self, other): if self.same_variable(other): return self.diff(other) == 0 @@ -777,6 +829,10 @@ def __ne__(self, other): return not self.__eq__(other) + def less(self, other): + if self.same_variable(other): + return self.diff(other) < 0 + def clone(self): c = IndexVar(self.var) c.coefficient_mul = self.coefficient_mul @@ -799,6 +855,18 @@ return 'IndexVar(%s*(%s/%s)+%s)' % (self.var, self.coefficient_mul, self.coefficient_div, self.constant) + def adapt_operation(self, op): + # TODO + if self.coefficient_mul == 1 and \ + self.coefficient_div == 1 and \ + op.getopnum() == rop.INT_ADD: + if isinstance(op.getarg(0), Box) and isinstance(op.getarg(1), Const): + op.setarg(0, self.var) + op.setarg(1, ConstInt(self.constant)) + elif isinstance(op.getarg(1), Box) and isinstance(op.getarg(0), Const): + op.setarg(1, self.var) + op.setarg(0, ConstInt(self.constant)) + class MemoryRef(object): """ a memory reference to an array object. IntegralForwardModification is able to propagate changes to this object if applied in backwards direction. diff --git a/rpython/jit/metainterp/optimizeopt/test/test_dependency.py b/rpython/jit/metainterp/optimizeopt/test/test_dependency.py --- a/rpython/jit/metainterp/optimizeopt/test/test_dependency.py +++ b/rpython/jit/metainterp/optimizeopt/test/test_dependency.py @@ -45,7 +45,7 @@ node_b = graph.getnode(idx_b) dependency = node_a.getedge_to(node_b) if dependency is None and idx_b not in exceptions.setdefault(idx,[]): - #self._write_dot_and_convert_to_svg(graph, graph.nodes, 'except') + self._write_dot_and_convert_to_svg(graph, 'except') assert dependency is not None, \ " it is expected that instruction at index" + \ " %s depends on instr on index %s but it does not.\n%s" \ diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py --- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py +++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py @@ -26,7 +26,7 @@ class VecTestHelper(DependencyBaseTest): - enable_opts = "intbounds:rewrite:virtualize:string:earlyforce:pure:heap:unfold" + enable_opts = "intbounds:rewrite:virtualize:string:earlyforce:pure:heap" jitdriver_sd = FakeJitDriverStaticData() @@ -57,6 +57,10 @@ raise NotAVectorizeableLoop() if unroll_factor == -1: unroll_factor = opt.get_unroll_count(ARCH_VEC_REG_SIZE) + opt.analyse_index_calculations() + if opt.dependency_graph is not None: + self._write_dot_and_convert_to_svg(opt.dependency_graph, "ee" + self.test_name) + opt.schedule() opt.unroll_loop_iterations(loop, unroll_factor) opt.loop.operations = opt.get_newoperations() opt.clear_newoperations() @@ -91,6 +95,16 @@ opt.schedule() return opt + def vectorize(self, loop, unroll_factor = -1): + opt = self.vectoroptimizer_unrolled(loop, unroll_factor) + opt.find_adjacent_memory_refs() + opt.extend_packset() + opt.combine_packset() + opt.schedule() + opt.collapse_index_guards() + self._do_optimize_loop(loop, {}, export_state=False) + return opt + def assert_unroll_loop_equals(self, loop, expected_loop, \ unroll_factor = -1): vectoroptimizer = self.vectoroptimizer_unrolled(loop, unroll_factor) @@ -696,12 +710,12 @@ loop = self.parse_loop(ops) vopt = self.extend_packset(loop,1) assert len(vopt.dependency_graph.memory_refs) == 4 + self.assert_independent(4,10) self.assert_independent(5,11) self.assert_independent(6,12) - self.assert_independent(7,13) assert len(vopt.packset.packs) == 3 self.assert_packset_empty(vopt.packset, len(loop.operations), - [(6,12), (5,11), (7,13)]) + [(5,11), (4,10), (6,12)]) @pytest.mark.parametrize("descr", ['char','float','int','singlefloat']) def test_packset_combine_simple(self,descr): @@ -853,8 +867,8 @@ i11 = int_le(i1, 128) guard_true(i11) [] i12 = int_add(i1, {stride}) + v2 = vec_getarrayitem_raw(p1, i0, 2, descr={descr}arraydescr) v1 = vec_getarrayitem_raw(p0, i0, 2, descr={descr}arraydescr) - v2 = vec_getarrayitem_raw(p1, i0, 2, descr={descr}arraydescr) v3 = {op}(v1,v2,2) vec_setarrayitem_raw(p2, i0, v3, 2, descr={descr}arraydescr) jump(p0,p1,p2,i12) @@ -919,6 +933,7 @@ """ opt=""" [i0, i1, i2, i3, i4] + i6 = int_mul(i0, 8) i11 = int_add(i0, 1) i12 = int_lt(i11, i1) guard_true(i12) [] @@ -926,9 +941,8 @@ i13 = int_add(i11, 1) i18 = int_lt(i13, i1) guard_true(i18) [] - i6 = int_mul(i0, 8) + v20 = vec_raw_load(i3, i6, 2, descr=intarraydescr) v19 = vec_raw_load(i2, i6, 2, descr=intarraydescr) - v20 = vec_raw_load(i3, i6, 2, descr=intarraydescr) v21 = vec_int_add(v19, v20, 2) vec_raw_store(i4, i6, v21, 2, descr=intarraydescr) jump(i13, i1, i2, i3, i4) @@ -976,22 +990,27 @@ def test_collapse_index_guard_1(self): ops = """ [p0,i0] - guard_early_exit() [] + guard_early_exit() [p0,i0] i1 = getarrayitem_raw(p0, i0, descr=intarraydescr) i2 = int_add(i0, 1) i3 = int_lt(i2, 102) guard_true(i3) [p0,i0] jump(p0,i2) """ + dead_code = '\n '.join([ + "i{t} = int_add(i0,{i})\n i{s} = int_lt(i{t}, 102)".format( + i=i+1, t=i+4, s=i+20) + for i in range(0,15)]) opt=""" [p0,i0] + {dead_code} i2 = int_add(i0, 16) i3 = int_lt(i2, 102) guard_true(i3) [p0,i0] i1 = vec_getarrayitem_raw(p0, i0, 16, descr=intarraydescr) jump(p0,i2) - """ - vopt = self.schedule(self.parse_loop(ops),15) + """.format(dead_code=dead_code) + vopt = self.vectorize(self.parse_loop(ops),15) self.assert_equal(vopt.loop, self.parse_loop(opt)) diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py --- a/rpython/jit/metainterp/optimizeopt/vectorize.py +++ b/rpython/jit/metainterp/optimizeopt/vectorize.py @@ -53,19 +53,22 @@ def_opt = Optimizer(metainterp_sd, jitdriver_sd, loop, optimizations) def_opt.propagate_all_forward() +#class CollapseGuardOptimization(Optimization): +# def __init__(self, index_vars = None): +# self.index_vars = index_vars or {} +# +# def propagate_forward( + class VectorizingOptimizer(Optimizer): """ Try to unroll the loop and find instructions to group """ def __init__(self, metainterp_sd, jitdriver_sd, loop, optimizations): Optimizer.__init__(self, metainterp_sd, jitdriver_sd, loop, optimizations) - self.memory_refs = [] self.dependency_graph = None - self.first_debug_merge_point = False self.packset = None self.unroll_count = 0 self.smallest_type_bytes = 0 - self.early_exit = None - self.future_condition = None + self.early_exit_idx = -1 def propagate_all_forward(self, clear=True): self.clear_newoperations() @@ -74,7 +77,6 @@ if jump.getopnum() not in (rop.LABEL, rop.JUMP): # compile_loop appends a additional label to all loops # we cannot optimize normal traces - assert False raise NotAVectorizeableLoop() self.linear_find_smallest_type(self.loop) @@ -85,6 +87,12 @@ # we cannot optimize normal traces (if there is no label) raise NotAVectorizeableLoop() + # find index guards and move to the earliest position + self.analyse_index_calculations() + if self.dependency_graph is not None: + self.schedule() # reorder the trace + + # unroll self.unroll_count = self.get_unroll_count(vsize) self.unroll_loop_iterations(self.loop, self.unroll_count) @@ -96,12 +104,13 @@ self.find_adjacent_memory_refs() self.extend_packset() self.combine_packset() - self.collapse_index_guards() self.schedule() + self.collapse_index_guards() + def emit_operation(self, op): - if op.getopnum() == rop.GUARD_EARLY_EXIT or \ - op.getopnum() == rop.DEBUG_MERGE_POINT: + #if op.getopnum() == rop.GUARD_EARLY_EXIT or \ + if op.getopnum() == rop.DEBUG_MERGE_POINT: return self._last_emitted_op = op self._newoperations.append(op) @@ -114,6 +123,7 @@ """ Unroll the loop X times. unroll_count is an integral how often to further unroll the loop. """ + op_count = len(loop.operations) label_op = loop.operations[0].clone() @@ -125,7 +135,7 @@ jump_op = ResOperation(rop.JUMP, jump_op.getarglist(), None, label_op.getdescr()) else: jump_op = jump_op.clone() - jump_op.setdescr(label_op.getdescr()) + #jump_op.setdescr(label_op.getdescr()) assert jump_op.is_final() self.emit_unrolled_operation(label_op) @@ -134,12 +144,11 @@ #self.emit_unrolled_operation(guard_ee_op) operations = [] + start_index = 1 for i in range(1,op_count-1): op = loop.operations[i].clone() - if loop.operations[i].getopnum() == rop.GUARD_FUTURE_CONDITION: - pass if loop.operations[i].getopnum() == rop.GUARD_EARLY_EXIT: - self.future_condition = op + continue operations.append(op) self.emit_unrolled_operation(op) @@ -157,11 +166,13 @@ if la != ja: rename_map[la] = ja # + emitted_ee = False for op in operations: if op.getopnum() == rop.GUARD_FUTURE_CONDITION: continue # do not unroll this operation twice if op.getopnum() == rop.GUARD_EARLY_EXIT: - continue # do not unroll this operation twice + emitted_ee = True + pass # do not unroll this operation twice copied_op = op.clone() if copied_op.result is not None: # every result assigns a new box, thus creates an entry @@ -180,7 +191,7 @@ # not only the arguments, but also the fail args need # to be adjusted. rd_snapshot stores the live variables # that are needed to resume. - if copied_op.is_guard(): + if copied_op.is_guard() and emitted_ee: assert isinstance(copied_op, GuardResOp) snapshot = self.clone_snapshot(copied_op.rd_snapshot, rename_map) copied_op.rd_snapshot = snapshot @@ -231,6 +242,8 @@ def linear_find_smallest_type(self, loop): # O(#operations) for i,op in enumerate(loop.operations): + if op.getopnum() == rop.GUARD_EARLY_EXIT: + self.early_exit_idx = i if op.is_array_op(): descr = op.getdescr() if not descr.is_array_of_pointers(): @@ -250,7 +263,6 @@ def build_dependency_graph(self): self.dependency_graph = DependencyGraph(self.loop.operations) - self.relax_index_guards() def find_adjacent_memory_refs(self): """ the pre pass already builds a hash of memory references and the @@ -346,12 +358,11 @@ break def schedule(self): + self.guard_early_exit = -1 self.clear_newoperations() scheduler = Scheduler(self.dependency_graph, VecScheduleData()) - #dprint("scheduling loop. scheduleable are: " + str(scheduler.schedulable_nodes)) while scheduler.has_more(): candidate = scheduler.next() - #dprint(" candidate", candidate, "has pack?", candidate.pack != None, "pack", candidate.pack) if candidate.pack: pack = candidate.pack if scheduler.schedulable(pack.operations): @@ -362,8 +373,6 @@ else: scheduler.schedule_later(0) else: - if candidate.getopnum() == rop.GUARD_EARLY_EXIT: - pass position = len(self._newoperations) self.emit_operation(candidate.getoperation()) scheduler.schedule(0, position) @@ -372,69 +381,90 @@ for node in self.dependency_graph.nodes: assert node.emitted self.loop.operations = self._newoperations[:] + self.clear_newoperations() - def relax_index_guards(self): - label_idx = 0 - early_exit_idx = 1 - label = self.dependency_graph.getnode(label_idx) - ee_guard = self.dependency_graph.getnode(early_exit_idx) - if not ee_guard.is_guard_early_exit(): - return # cannot relax + def analyse_index_calculations(self): + if len(self.loop.operations) <= 1 or self.early_exit_idx == -1: + return - #self.early_exit = ee_guard + self.dependency_graph = dependencies = DependencyGraph(self.loop.operations) - for guard_node in self.dependency_graph.guards: - if guard_node == ee_guard: - continue - if guard_node.getopnum() not in (rop.GUARD_TRUE,rop.GUARD_FALSE): + label_node = dependencies.getnode(0) + ee_guard_node = dependencies.getnode(self.early_exit_idx) + guards = dependencies.guards + fail_args = [] + for guard_node in guards: + if guard_node is ee_guard_node: continue del_deps = [] pullup = [] - iterb = guard_node.iterate_paths(ee_guard, True) last_prev_node = None - for path in iterb: + for path in guard_node.iterate_paths(ee_guard_node, True): prev_node = path.second() - if fail_args_break_dependency(guard_node, prev_node, ee_guard): + if fail_args_break_dependency(guard_node, prev_node, ee_guard_node): if prev_node == last_prev_node: continue - dprint("relax) ", prev_node, "=>", guard_node) - del_deps.append((prev_node,guard_node)) + del_deps.append((prev_node, guard_node)) else: - pullup.append(path) + if path.has_no_side_effects(exclude_first=True, exclude_last=True): + #index_guards[guard.getindex()] = IndexGuard(guard, path.path[:]) + pullup.append(path.last_but_one()) last_prev_node = prev_node for a,b in del_deps: a.remove_edge_to(b) - for candidate in pullup: - lbo = candidate.last_but_one() - if candidate.has_no_side_effects(exclude_first=True, exclude_last=True): - ee_guard.remove_edge_to(lbo) - label.edge_to(lbo, label='pullup') - guard_node.edge_to(ee_guard, label='pullup') - label.remove_edge_to(ee_guard) - - guard_node.relax_guard_to(self.future_condition) + for lbo in pullup: + if lbo is ee_guard_node: + continue + ee_guard_node.remove_edge_to(lbo) + label_node.edge_to(lbo, label='pullup') + # only the last guard needs a connection + guard_node.edge_to(ee_guard_node, label='pullup-last-guard') + guard_node.relax_guard_to(ee_guard_node) def collapse_index_guards(self): - pass - #final_ops = [] - #last_guard = None - #is_after_relax = False - #for op in self._newoperations: - # if op.getopnum() == rop.GUARD_EARLY_EXIT: - # assert last_guard is not None - # final_ops.append(last_guard) - # is_after_relax = True - # continue - # if not is_after_relax: - # if op.is_guard(): - # last_guard = op - # else: - # final_ops.append(op) - # else: - # final_ops.append(op) - #assert is_after_relax - #return final_ops + strongest_guards = {} + strongest_guards_var = {} + index_vars = self.dependency_graph.index_vars + operations = self.loop.operations + var_for_guard = {} + for i in range(len(operations)-1, -1, -1): + op = operations[i] + if op.is_guard(): + for arg in op.getarglist(): + var_for_guard[arg] = True + try: + comparison = index_vars[arg] + for index_var in comparison.getindex_vars(): + var = index_var.getvariable() + strongest_known = strongest_guards_var.get(var, None) + if not strongest_known: + strongest_guards_var[var] = index_var + continue + if index_var.less(strongest_known): + strongest_guards_var[var] = strongest_known + strongest_guards[op] = strongest_known + except KeyError: + pass + last_op_idx = len(operations)-1 + for op in operations: + if op.is_guard(): + stronger_guard = strongest_guards.get(op, None) + if stronger_guard: + # there is a stronger guard + continue + else: + self.emit_operation(op) + continue + if op.is_always_pure() and op.result: + try: + var_index = index_vars[op.result] + var_index.adapt_operation(op) + except KeyError: + pass + self.emit_operation(op) + + self.loop.operations = self._newoperations[:] def must_unpack_result_to_exec(op, target_op): # TODO either move to resop or util @@ -445,7 +475,6 @@ def prohibit_packing(op1, op2): if op1.is_array_op(): if op1.getarg(1) == op2.result: - dprint("prohibit)", op1, op2) return True return False _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit