Author: Richard Plangger <planri...@gmail.com> Branch: vecopt-merge Changeset: r79589:7783b6299af3 Date: 2015-09-11 12:51 +0200 http://bitbucket.org/pypy/pypy/changeset/7783b6299af3/
Log: adjusting the resop creation to remove the vectorbox, work in progress diff --git a/rpython/jit/metainterp/optimizeopt/schedule.py b/rpython/jit/metainterp/optimizeopt/schedule.py --- a/rpython/jit/metainterp/optimizeopt/schedule.py +++ b/rpython/jit/metainterp/optimizeopt/schedule.py @@ -8,33 +8,44 @@ from rpython.jit.metainterp.jitexc import NotAProfitableLoop -class SchedulerData(object): - pass +class SchedulerState(object): + def __init__(self, graph): + self.renamer = Renamer() + self.graph = graph + self.oplist = [] + self.worklist = [] + + def post_schedule(self): + pass + + def profitable(self): + return self.costmodel.profitable() + + def prepare(self): + pass + + def has_more(self): + return len(self.worklist) > 0 + class Scheduler(object): """ The base class to be instantiated to (re)schedule a vector trace. """ - def __init__(self, graph, sched_data): - assert isinstance(sched_data, SchedulerData) - self.graph = graph - self.schedulable_nodes = self.graph.schedulable_nodes - self.sched_data = sched_data - self.oplist = None - self.renamer = None + def __init__(self): + pass - def has_more(self): - return len(self.schedulable_nodes) > 0 - - def next_index(self, candidate_list): - i = len(candidate_list)-1 - while i >= 0: - candidate = candidate_list[i] - if candidate.emitted: - del candidate_list[i] - i -= 1 + def next(self, state): + worklist = state.worklist + visited = 0 + while len(worklist) > 0: + if visited == len(worklist): + return None + node = worklist.pop() + if node.emitted: continue - if self.schedulable(candidate): - return i - i -= 1 - return -1 + if self.schedulable(node): + return node + worklist.insert(0, node) + visited += 1 + return None def schedulable(self, candidate): """ Is the candidate scheduleable? Boils down to dependency_count == 0 @@ -53,12 +64,14 @@ return False return candidate.depends_count() == 0 - def scheduled(self, node): + def scheduled(self, node, state): """ Call this function if an operation has been emitted adds new operations to the schedule list if their dependency count drops to zero. In addition it keeps the list sorted (see priority) """ + state.renamer.rename(op) + state.unpack_from_vector(op, self) node.position = len(self.oplist) for dep in node.provides()[:]: # COPY to = dep.to @@ -85,36 +98,37 @@ node.clear_dependencies() node.emitted = True - def emit_into(self, oplist, renamer, unpack=False): + def walk_and_emit(self, state): # TODO oplist, renamer, unpack=False): """ Emit all the operations into the oplist parameter. Initiates the scheduling. """ - self.renamer = renamer - self.oplist = oplist - self.unpack = unpack + assert isinstance(state, SchedulerState) + while state.has_more(): + node = self.next(state) + if node: + if not state.emit(node): + if not node.emitted: + op = node.getoperation() + scheduler.scheduled(node, state) + continue - while self.has_more(): - i = self.next_index(self.schedulable_nodes) - if i >= 0: - candidate = self.schedulable_nodes[i] - del self.schedulable_nodes[i] - self.sched_data.schedule_candidate(self, candidate) - continue + # it happens that packs can emit many nodes that have been # added to the scheuldable_nodes list, in this case it could # be that no next exists even though the list contains elements - if not self.has_more(): + if not state.has_more(): break raise AssertionError("schedule failed cannot continue. possible reason: cycle") - jump_node = self.graph.nodes[-1] - jump_op = jump_node.getoperation() - renamer.rename(jump_op) - assert jump_op.getopnum() == rop.JUMP - self.sched_data.unpack_from_vector(jump_op, self) - oplist.append(jump_op) + # TODO + #jump_node = self.graph.nodes[-1] + #jump_op = jump_node.getoperation() + #renamer.rename(jump_op) + #assert jump_op.getopnum() == rop.JUMP + #self.sched_data.unpack_from_vector(jump_op, self) + #oplist.append(jump_op) def vectorbox_outof_box(box, count=-1, size=-1, type='-'): if box.type not in (FLOAT, INT): @@ -178,7 +192,7 @@ @staticmethod def of(box, count=-1): - assert isinstance(box, BoxVector) + assert box.type == 'V' if count == -1: count = box.getcount() return PackType(box.gettype(), box.getsize(), box.getsigned(), count) @@ -210,6 +224,7 @@ assert count > 1 assert self.type in ('i','f') assert self.size > 0 + xxx return BoxVector(self.type, count, self.size, self.signed) def combine(self, other): @@ -312,10 +327,9 @@ self.before_argument_transform(args) self.transform_arguments(args) # - result = op.result - result = self.transform_result(result) + vop = ResOperation(op.vector, args, op.getdescr()) + #result = self.transform_result(op) # - vop = ResOperation(op.vector, args, result, op.getdescr()) if op.is_guard(): assert isinstance(op, GuardResOp) assert isinstance(vop, GuardResOp) @@ -334,7 +348,7 @@ if i >= vbox.getcount(): break op = node.getoperation() - self.sched_data.setvector_of_box(op.result, i, vbox) + self.sched_data.setvector_of_box(op, i, vbox) return vbox def new_result_vector_box(self): @@ -348,9 +362,18 @@ return self.pack.operations def transform_arguments(self, args): - """ Transforming one argument to a vector box argument """ + """ Transforming one argument to a vector box argument + The following cases can occur: + 1) argument is present in the box_to_vbox map. + a) vector can be reused immediatly (simple case) + b) vector is to big + c) vector is to small + 2) argument is not known to reside in a vector + a) expand vars/consts before the label and add as argument + b) expand vars created in the loop body + """ for i,arg in enumerate(args): - if isinstance(arg, BoxVector): + if arg.returns_vector(): continue if not self.is_vector_arg(i): continue @@ -478,7 +501,7 @@ return new_box def _check_vec_pack(self, op): - result = op.result + result = op arg0 = op.getarg(0) arg1 = op.getarg(1) index = op.getarg(2) @@ -754,63 +777,89 @@ raise NotImplementedError("missing vecop for '%s'" % (op.getopname(),)) return op2vecop -class VecScheduleData(SchedulerData): - def __init__(self, vec_reg_size, costmodel, inputargs): +class VecScheduleState(SchedulerState): + def __init__(self, graph, packset, cpu, costmodel): + SchedulerState.__init__(self, graph) self.box_to_vbox = {} - self.vec_reg_size = vec_reg_size + self.cpu = cpu + self.vec_reg_size = cpu.vector_register_size self.invariant_oplist = [] self.invariant_vector_vars = [] self.expanded_map = {} self.costmodel = costmodel self.inputargs = {} - for arg in inputargs: + self.packset = packset + for arg in graph.loop.inputargs: self.inputargs[arg] = None self.seen = {} - def schedule_candidate(self, scheduler, candidate): + def post_schedule(self): + pass + # TODO label rename + if vector: + # XXX + # add accumulation info to the descriptor + #for version in self.loop.versions: + # # this needs to be done for renamed (accum arguments) + # version.renamed_inputargs = [ renamer.rename_map.get(arg,arg) for arg in version.inputargs ] + #self.appended_arg_count = len(sched_data.invariant_vector_vars) + ##for guard_node in graph.guards: + ## op = guard_node.getoperation() + ## failargs = op.getfailargs() + ## for i,arg in enumerate(failargs): + ## if arg is None: + ## continue + ## accum = arg.getaccum() + ## if accum: + ## pass + ## #accum.save_to_descr(op.getdescr(),i) + #self.has_two_labels = len(sched_data.invariant_oplist) > 0 + #self.loop.operations = self.prepend_invariant_operations(sched_data) + pass + + + def profitable(self): + return self.costmodel.profitable() + + def prepare(self): + SchedulerState.prepare(self) + self.graph.prepare_for_scheduling() + self.packset.accumulate_prepare(self) + for arg in self.graph.loop.label.getarglist(): + self.seen[arg] = None + + def emit(self, node, scheduler): """ If you implement a scheduler this operations is called to emit the actual operation into the oplist of the scheduler. """ - renamer = scheduler.renamer - if candidate.pack: - for node in candidate.pack.operations: - renamer.rename(node.getoperation()) + if node.pack: + for node in node.pack.operations: scheduler.scheduled(node) - self.as_vector_operation(scheduler, candidate.pack) - else: - op = candidate.getoperation() - renamer.rename(op) - self.unpack_from_vector(op, scheduler) - scheduler.scheduled(candidate) - op = candidate.getoperation() - # - # prevent some instructions in the resulting trace! - if op.getopnum() in (rop.DEBUG_MERGE_POINT, - rop.GUARD_EARLY_EXIT): - return - scheduler.oplist.append(op) + self.as_vector_operation(node.pack) + return True + return False - def as_vector_operation(self, scheduler, pack): + + def as_vector_operation(self, pack): """ Transform a pack into a single or several operation. Calls the as_vector_operation of the OpToVectorOp implementation. """ assert pack.opcount() > 1 # properties that hold for the pack are: - # + isomorphism (see func above) + # + isomorphism (see func) # + tightly packed (no room between vector elems) - oplist = scheduler.oplist - position = len(oplist) - op = pack.operations[0].getoperation() - determine_trans(op).as_vector_operation(pack, self, scheduler, oplist) + position = len(self.oplist) + op = pack.leftmost().getoperation() + determine_trans(op).as_vector_operation(pack, self, self.oplist) # if pack.is_accumulating(): box = oplist[position].result assert box is not None for node in pack.operations: op = node.getoperation() - assert op.result is not None - scheduler.renamer.start_renaming(op.result, box) + assert not op.returns_void() + scheduler.renamer.start_renaming(op, box) def unpack_from_vector(self, op, scheduler): """ If a box is needed that is currently stored within a vector @@ -820,17 +869,17 @@ # unpack for an immediate use for i, arg in enumerate(op.getarglist()): - if isinstance(arg, Box): + if not arg.is_constant(): argument = self._unpack_from_vector(i, arg, scheduler) if arg is not argument: op.setarg(i, argument) - if op.result: - self.seen[op.result] = None + if not op.returns_void(): + self.seen[op] = None # unpack for a guard exit if op.is_guard(): fail_args = op.getfailargs() for i, arg in enumerate(fail_args): - if arg and isinstance(arg, Box): + if arg and not arg.is_constant(): argument = self._unpack_from_vector(i, arg, scheduler) if arg is not argument: fail_args[i] = argument @@ -865,7 +914,7 @@ def setvector_of_box(self, box, off, vector): assert off < vector.getcount() - assert not isinstance(box, BoxVector) + assert box.type != 'V' self.box_to_vbox[box] = (off, vector) def opcount_filling_vector_register(pack, vec_reg_size): diff --git a/rpython/jit/metainterp/optimizeopt/test/test_dependency.py b/rpython/jit/metainterp/optimizeopt/test/test_dependency.py --- a/rpython/jit/metainterp/optimizeopt/test/test_dependency.py +++ b/rpython/jit/metainterp/optimizeopt/test/test_dependency.py @@ -6,13 +6,28 @@ from rpython.jit.metainterp.history import TargetToken, JitCellToken, TreeLoop from rpython.jit.metainterp.optimizeopt.dependency import (DependencyGraph, Dependency, IndexVar, MemoryRef, Node) -from rpython.jit.metainterp.optimizeopt.vector import TraceLoop +from rpython.jit.metainterp.optimizeopt.vector import VectorLoop from rpython.jit.metainterp.resoperation import rop, ResOperation from rpython.jit.backend.llgraph.runner import ArrayDescr from rpython.rtyper.lltypesystem import rffi from rpython.rtyper.lltypesystem import lltype from rpython.conftest import option +class FakeDependencyGraph(DependencyGraph): + """ A dependency graph that is able to emit every instruction + one by one. """ + def __init__(self, loop): + self.loop = loop + if isinstance(loop, list): + self.nodes = loop + else: + operations = loop.operations + self.nodes = [Node(op,i) for i,op in \ + enumerate(operations)] + self.schedulable_nodes = list(reversed(self.nodes)) + self.guards = [] + + class DependencyBaseTest(BaseTest): def setup_method(self, method): @@ -26,12 +41,20 @@ assert node.independent(node) return self.last_graph - def parse_loop(self, ops): + def parse_loop(self, ops, add_label=True, **kwargs): loop = self.parse(ops, postprocess=self.postprocess) + loop.operations = filter(lambda op: op.getopnum() != rop.DEBUG_MERGE_POINT, loop.operations) token = JitCellToken() - label = ResOperation(rop.LABEL, loop.inputargs, descr=TargetToken(token)) - loop = TraceLoop(label, loop.operations[:-1], loop.operations[-1]) + if add_label: + label = ResOperation(rop.LABEL, loop.inputargs, descr=TargetToken(token)) + else: + label = loop.operations[0] + label.setdescr(TargetToken(token)) + loop = VectorLoop(label, loop.operations[1:-1], loop.operations[-1]) loop.jump.setdescr(token) + for op in loop.operations: + if op.getopnum() == rop.GUARD_EARLY_EXIT and op.getdescr() is None: + op.setdescr(compile.ResumeAtLoopHeaderDescr()) return loop def assert_edges(self, graph, edge_list, exceptions): @@ -533,8 +556,7 @@ n1,n2 = FakeNode(1), FakeNode(2) n1.edge_to(n2); n2.edge_to(n1) - graph = FakeDependencyGraph() - graph.nodes = [n1,n2] + graph = FakeDependencyGraph([n1,n2]) cycle = graph.cycles() assert cycle == [n1, n2] @@ -547,7 +569,7 @@ n1,n2,n3,n4 = FakeNode(1), FakeNode(2), FakeNode(3), FakeNode(4) n1.edge_to(n3); n3.edge_to(n4); n4.edge_to(n1) - graph = FakeDependencyGraph() + graph = FakeDependencyGraph([n1,n2]) graph.nodes = [n1,n2,n3] cycle = graph.cycles() assert cycle is not None @@ -584,10 +606,6 @@ def __repr__(self): return "n%d" % self.opidx -class FakeDependencyGraph(DependencyGraph): - def __init__(self): - pass - class TestLLtype(BaseTestDependencyGraph, LLtypeMixin): pass diff --git a/rpython/jit/metainterp/optimizeopt/test/test_schedule.py b/rpython/jit/metainterp/optimizeopt/test/test_schedule.py --- a/rpython/jit/metainterp/optimizeopt/test/test_schedule.py +++ b/rpython/jit/metainterp/optimizeopt/test/test_schedule.py @@ -3,13 +3,14 @@ from rpython.jit.metainterp.history import TargetToken, JitCellToken, TreeLoop from rpython.jit.metainterp.optimizeopt.util import equaloplists from rpython.jit.metainterp.optimizeopt.renamer import Renamer -from rpython.jit.metainterp.optimizeopt.vec import (VecScheduleData, +from rpython.jit.metainterp.optimizeopt.vector import (VecScheduleState, Pack, Pair, NotAProfitableLoop, VectorizingOptimizer, X86_CostModel, PackSet) from rpython.jit.metainterp.optimizeopt.dependency import Node, DependencyGraph -from rpython.jit.metainterp.optimizeopt.schedule import PackType +from rpython.jit.metainterp.optimizeopt.schedule import PackType, Scheduler from rpython.jit.metainterp.optimizeopt.test.test_util import LLtypeMixin -from rpython.jit.metainterp.optimizeopt.test.test_dependency import DependencyBaseTest +from rpython.jit.metainterp.optimizeopt.test.test_dependency import (DependencyBaseTest, + FakeDependencyGraph) from rpython.jit.metainterp.optimizeopt.test.test_vecopt import (FakeMetaInterpStaticData, FakeJitDriverStaticData) from rpython.jit.metainterp.resoperation import rop, ResOperation @@ -29,19 +30,10 @@ self.packs = packs self.vec_reg_size = 16 -class FakeDependencyGraph(DependencyGraph): - """ A dependency graph that is able to emit every instruction - one by one. """ - def __init__(self, loop): - self.nodes = [Node(op,i) for i,op in \ - enumerate(loop.operations)] - self.schedulable_nodes = list(reversed(self.nodes)) - self.guards = [] - class SchedulerBaseTest(DependencyBaseTest): - def namespace(self): - return { + def setup_class(self): + self.namespace = { 'double': self.floatarraydescr, 'float': self.float32arraydescr, 'long': self.arraydescr, @@ -50,12 +42,8 @@ 'char': self.chararraydescr, } - def parse(self, source, inc_label_jump=True, - pargs=2, - iargs=10, - fargs=6, - additional_args=None, - replace_args=None): + def parse_trace(self, source, inc_label_jump=True, pargs=2, iargs=10, + fargs=6, additional_args=None, replace_args=None): args = [] for prefix, rang in [('p',range(pargs)), ('i',range(iargs)), @@ -75,16 +63,8 @@ joinedargs = ','.join(args) fmt = (indent, joinedargs, source, indent, joinedargs) src = "%s[%s]\n%s\n%sjump(%s)" % fmt - loop = opparse(src, cpu=self.cpu, namespace=self.namespace()) - if inc_label_jump: - token = JitCellToken() - label = ResOperation(rop.LABEL, loop.inputargs, descr=TargetToken(token)) - loop.operations = [label] + loop.operations - loop.graph = FakeDependencyGraph(loop) - return loop - else: - loop.graph = FakeDependencyGraph(loop) - del loop.operations[-1] + loop = self.parse_loop(src) + loop.graph = FakeDependencyGraph(loop) return loop def pack(self, loop, l, r, input_type, output_type): @@ -92,21 +72,8 @@ def schedule(self, loop, packs, vec_reg_size=16, prepend_invariant=False, overwrite_funcs=None): - ops = [] - cm = X86_CostModel(0, vec_reg_size) - def profitable(): - return True - cm.profitable = profitable - vsd = VecScheduleData(vec_reg_size, cm, loop.inputargs[:]) - for name, overwrite in (overwrite_funcs or {}).items(): - setattr(vsd, name, overwrite) - renamer = Renamer() - metainterp_sd = FakeMetaInterpStaticData(self.cpu) - jitdriver_sd = FakeJitDriverStaticData() - opt = VectorizingOptimizer(metainterp_sd, jitdriver_sd, loop, 0) - opt.costmodel = cm - opt.dependency_graph = loop.graph - del loop.graph + cm = X86_CostModel(self.cpu, 0) + cm.profitable = lambda: True pairs = [] for pack in packs: for i in range(len(pack.operations)-1): @@ -115,29 +82,39 @@ o2 = pack.operations[i+1] pair = Pair(o1,o2,pack.input_type,pack.output_type) pairs.append(pair) - - opt.packset = FakePackSet(pairs) - + packset = FakePackSet(pairs) + state = VecScheduleState(loop.graph, packset, self.cpu, cm) + for name, overwrite in (overwrite_funcs or {}).items(): + setattr(state, name, overwrite) + renamer = Renamer() + metainterp_sd = FakeMetaInterpStaticData(self.cpu) + jitdriver_sd = FakeJitDriverStaticData() + opt = VectorizingOptimizer(metainterp_sd, jitdriver_sd, 0) + opt.packset = packset if not prepend_invariant: - def pio(oplist, labels): - return oplist - vsd.prepend_invariant_operations = pio - + state.prepend_invariant_operations = lambda list, _: list opt.combine_packset() - opt.schedule(True, sched_data=vsd) - - loop.operations = \ - [op for op in loop.operations \ - if not (op.is_final() or op.is_label())] - - return loop - - def assert_operations_match(self, loop_a, loop_b): - assert equaloplists(loop_a.operations, loop_b.operations) + opt.schedule(state) class Test(SchedulerBaseTest, LLtypeMixin): + + def test_next_must_not_loop_forever(self): + scheduler = Scheduler() + def schedulable(node): + node.count += 1 + return False + scheduler.schedulable = schedulable + class State(object): pass + class Node(object): emitted = False; pack = None; count = 0 + state = State() + state.worklist = [Node(), Node(), Node(), Node(), Node()] + assert scheduler.next(state) is None + for node in state.worklist: + assert node.count == 1 + # must return here, then the test passed + def test_schedule_split_load(self): - loop1 = self.parse(""" + loop1 = self.parse_trace(""" f10 = raw_load_f(p0, i0, descr=float) f11 = raw_load_f(p0, i1, descr=float) f12 = raw_load_f(p0, i2, descr=float) @@ -147,15 +124,15 @@ """) pack1 = self.pack(loop1, 0, 6, None, F32) loop2 = self.schedule(loop1, [pack1]) - loop3 = self.parse(""" - v10[i32|4] = vec_raw_load(p0, i0, 4, descr=float) + loop3 = self.parse_trace(""" + v10[i32|4] = vec_raw_load_i(p0, i0, 4, descr=float) f10 = raw_load_f(p0, i4, descr=float) f11 = raw_load_f(p0, i5, descr=float) """, False) self.assert_equal(loop2, loop3) def test_int_to_float(self): - loop1 = self.parse(""" + loop1 = self.parse_trace(""" i10 = raw_load(p0, i0, descr=long) i11 = raw_load(p0, i1, descr=long) i12 = int_signext(i10, 4) @@ -167,21 +144,21 @@ pack2 = self.pack(loop1, 2, 4, I64, I32_2) pack3 = self.pack(loop1, 4, 6, I32_2, F32_2) loop2 = self.schedule(loop1, [pack1, pack2, pack3]) - loop3 = self.parse(""" - v10[i64|2] = vec_raw_load(p0, i0, 2, descr=long) + loop3 = self.parse_trace(""" + v10[i64|2] = vec_raw_load_i(p0, i0, 2, descr=long) v20[i32|2] = vec_int_signext(v10[i64|2], 4) v30[f64|2] = vec_cast_int_to_float(v20[i32|2]) """, False) self.assert_equal(loop2, loop3) def test_scalar_pack(self): - loop1 = self.parse(""" + loop1 = self.parse_trace(""" i10 = int_add(i0, 73) i11 = int_add(i1, 73) """) pack1 = self.pack(loop1, 0, 2, I64, I64) loop2 = self.schedule(loop1, [pack1], prepend_invariant=True) - loop3 = self.parse(""" + loop3 = self.parse_trace(""" v10[i64|2] = vec_box(2) v20[i64|2] = vec_int_pack(v10[i64|2], i0, 0, 1) v30[i64|2] = vec_int_pack(v20[i64|2], i1, 1, 1) @@ -191,13 +168,13 @@ """, False) self.assert_equal(loop2, loop3) - loop1 = self.parse(""" + loop1 = self.parse_trace(""" f10 = float_add(f0, 73.0) f11 = float_add(f1, 73.0) """) pack1 = self.pack(loop1, 0, 2, F64, F64) loop2 = self.schedule(loop1, [pack1], prepend_invariant=True) - loop3 = self.parse(""" + loop3 = self.parse_trace(""" v10[f64|2] = vec_box(2) v20[f64|2] = vec_float_pack(v10[f64|2], f0, 0, 1) v30[f64|2] = vec_float_pack(v20[f64|2], f1, 1, 1) @@ -208,7 +185,7 @@ self.assert_equal(loop2, loop3) def test_scalar_remember_expansion(self): - loop1 = self.parse(""" + loop1 = self.parse_trace(""" f10 = float_add(f0, f5) f11 = float_add(f1, f5) f12 = float_add(f10, f5) @@ -217,7 +194,7 @@ pack1 = self.pack(loop1, 0, 2, F64, F64) pack2 = self.pack(loop1, 2, 4, F64, F64) loop2 = self.schedule(loop1, [pack1, pack2], prepend_invariant=True) - loop3 = self.parse(""" + loop3 = self.parse_trace(""" v10[f64|2] = vec_box(2) v20[f64|2] = vec_float_pack(v10[f64|2], f0, 0, 1) v30[f64|2] = vec_float_pack(v20[f64|2], f1, 1, 1) @@ -235,7 +212,7 @@ raise Exception("could not find %s in args %s" % (name, loop.inputargs)) def test_signext_int32(self): - loop1 = self.parse(""" + loop1 = self.parse_trace(""" i10 = int_signext(i1, 4) i11 = int_signext(i1, 4) """, additional_args=['v10[i64|2]']) @@ -247,13 +224,13 @@ overwrite_funcs = { 'getvector_of_box': i1inv103204, }) - loop3 = self.parse(""" + loop3 = self.parse_trace(""" v11[i32|2] = vec_int_signext(v10[i64|2], 4) """, False, additional_args=['v10[i64|2]']) self.assert_equal(loop2, loop3) def test_cast_float_to_int(self): - loop1 = self.parse(""" + loop1 = self.parse_trace(""" f10 = raw_load(p0, i1, descr=double) f11 = raw_load(p0, i2, descr=double) f12 = raw_load(p0, i3, descr=double) @@ -301,11 +278,11 @@ overwrite_funcs={ '_prevent_signext': void }) - loop3 = self.parse(""" - v10[f64|2] = vec_raw_load(p0, i1, 2, descr=double) - v11[f64|2] = vec_raw_load(p0, i3, 2, descr=double) - v12[f64|2] = vec_raw_load(p0, i5, 2, descr=double) - v13[f64|2] = vec_raw_load(p0, i7, 2, descr=double) + loop3 = self.parse_trace(""" + v10[f64|2] = vec_raw_load_f(p0, i1, 2, descr=double) + v11[f64|2] = vec_raw_load_f(p0, i3, 2, descr=double) + v12[f64|2] = vec_raw_load_f(p0, i5, 2, descr=double) + v13[f64|2] = vec_raw_load_f(p0, i7, 2, descr=double) v14[i32|2] = vec_cast_float_to_int(v10[f64|2]) v15[i32|2] = vec_cast_float_to_int(v11[f64|2]) v16[i32|2] = vec_cast_float_to_int(v12[f64|2]) @@ -322,7 +299,7 @@ self.assert_equal(loop2, loop3) def test_cast_float_to_single_float(self): - loop1 = self.parse(""" + loop1 = self.parse_trace(""" f10 = raw_load(p0, i1, descr=double) f11 = raw_load(p0, i2, descr=double) f12 = raw_load(p0, i3, descr=double) @@ -342,9 +319,9 @@ pack2 = self.pack(loop1, 4, 8, F64, I32_2) pack3 = self.pack(loop1, 8, 12, I32, None) loop2 = self.schedule(loop1, [pack1,pack2,pack3]) - loop3 = self.parse(""" - v44[f64|2] = vec_raw_load(p0, i1, 2, descr=double) - v45[f64|2] = vec_raw_load(p0, i3, 2, descr=double) + loop3 = self.parse_trace(""" + v44[f64|2] = vec_raw_load_f(p0, i1, 2, descr=double) + v45[f64|2] = vec_raw_load_f(p0, i3, 2, descr=double) v46[i32|2] = vec_cast_float_to_singlefloat(v44[f64|2]) v47[i32|2] = vec_cast_float_to_singlefloat(v45[f64|2]) v41[i32|4] = vec_int_pack(v46[i32|2], v47[i32|2], 2, 2) @@ -353,7 +330,7 @@ self.assert_equal(loop2, loop3) def test_all(self): - loop1 = self.parse(""" + loop1 = self.parse_trace(""" i10 = raw_load(p0, i1, descr=long) i11 = raw_load(p0, i2, descr=long) # @@ -367,9 +344,9 @@ pack2 = self.pack(loop1, 2, 4, I64, I64) pack3 = self.pack(loop1, 4, 6, I64, None) loop2 = self.schedule(loop1, [pack1,pack2,pack3], prepend_invariant=True) - loop3 = self.parse(""" + loop3 = self.parse_trace(""" v9[i64|2] = vec_int_expand(255,2) - v10[i64|2] = vec_raw_load(p0, i1, 2, descr=long) + v10[i64|2] = vec_raw_load_i(p0, i1, 2, descr=long) v11[i64|2] = vec_int_and(v10[i64|2], v9[i64|2]) guard_true(v11[i64|2]) [] """, False) @@ -377,7 +354,7 @@ def test_split_load_store(self): - loop1 = self.parse(""" + loop1 = self.parse_trace(""" i10 = raw_load(p0, i1, descr=float) i11 = raw_load(p0, i2, descr=float) i12 = raw_load(p0, i3, descr=float) @@ -388,8 +365,8 @@ pack1 = self.pack(loop1, 0, 4, None, I32) pack2 = self.pack(loop1, 4, 6, I32_2, None) loop2 = self.schedule(loop1, [pack1,pack2], prepend_invariant=True) - loop3 = self.parse(""" - v1[i32|4] = vec_raw_load(p0, i1, 4, descr=float) + loop3 = self.parse_trace(""" + v1[i32|4] = vec_raw_load_i(p0, i1, 4, descr=float) i10 = vec_int_unpack(v1[i32|4], 0, 1) raw_store(p0, i3, i10, descr=float) i11 = vec_int_unpack(v1[i32|4], 1, 1) @@ -400,13 +377,13 @@ self.assert_equal(loop2, loop3) def test_split_arith(self): - loop1 = self.parse(""" + loop1 = self.parse_trace(""" i10 = int_and(255, i1) i11 = int_and(255, i1) """) pack1 = self.pack(loop1, 0, 2, I64, I64) loop2 = self.schedule(loop1, [pack1], prepend_invariant=True) - loop3 = self.parse(""" + loop3 = self.parse_trace(""" v1[i64|2] = vec_int_expand(255,2) v2[i64|2] = vec_int_expand(i1,2) v3[i64|2] = vec_int_and(v1[i64|2], v2[i64|2]) @@ -414,13 +391,13 @@ self.assert_equal(loop2, loop3) def test_split_arith(self): - loop1 = self.parse(""" + loop1 = self.parse_trace(""" i10 = int_and(255, i1) i11 = int_and(255, i1) """) pack1 = self.pack(loop1, 0, 2, I64, I64) loop2 = self.schedule(loop1, [pack1], prepend_invariant=True) - loop3 = self.parse(""" + loop3 = self.parse_trace(""" v1[i64|2] = vec_int_expand(255, 2) v2[i64|2] = vec_int_expand(i1, 2) v3[i64|2] = vec_int_and(v1[i64|2], v2[i64|2]) @@ -428,7 +405,7 @@ self.assert_equal(loop2, loop3) def test_no_vec_impl(self): - loop1 = self.parse(""" + loop1 = self.parse_trace(""" i10 = int_and(255, i1) i11 = int_and(255, i2) i12 = uint_floordiv(i10,1) @@ -439,7 +416,7 @@ pack1 = self.pack(loop1, 0, 2, I64, I64) pack4 = self.pack(loop1, 4, 6, I64, I64) loop2 = self.schedule(loop1, [pack1,pack4], prepend_invariant=True) - loop3 = self.parse(""" + loop3 = self.parse_trace(""" v1[i64|2] = vec_int_expand(255,2) v2[i64|2] = vec_box(2) v3[i64|2] = vec_int_pack(v2[i64|2], i1, 0, 1) @@ -457,7 +434,7 @@ self.assert_equal(loop2, loop3) def test_split_cast(self): - trace = self.parse(""" + trace = self.parse_trace(""" f10 = cast_int_to_float(i1) f11 = cast_int_to_float(i2) f12 = cast_int_to_float(i3) @@ -470,7 +447,7 @@ assert len(packs) == 2 def test_combine_packset_nearly_empty_pack(self): - trace = self.parse(""" + trace = self.parse_trace(""" i10 = int_add(i1, i3) i11 = int_add(i2, i3) """) diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vecopt.py b/rpython/jit/metainterp/optimizeopt/test/test_vecopt.py --- a/rpython/jit/metainterp/optimizeopt/test/test_vecopt.py +++ b/rpython/jit/metainterp/optimizeopt/test/test_vecopt.py @@ -11,9 +11,9 @@ import rpython.jit.metainterp.optimizeopt.optimizer as optimizeopt import rpython.jit.metainterp.optimizeopt.virtualize as virtualize from rpython.jit.metainterp.optimizeopt.dependency import DependencyGraph -from rpython.jit.metainterp.optimizeopt.vectorize import (VectorizingOptimizer, MemoryRef, +from rpython.jit.metainterp.optimizeopt.vector import (VectorizingOptimizer, MemoryRef, isomorphic, Pair, NotAVectorizeableLoop, NotAProfitableLoop, GuardStrengthenOpt, - CostModel) + CostModel, VectorLoop) from rpython.jit.metainterp.optimize import InvalidLoop from rpython.jit.metainterp import compile from rpython.jit.metainterp.resoperation import rop, ResOperation @@ -41,25 +41,6 @@ jitdriver_sd = FakeJitDriverStaticData() - def parse_loop(self, ops, add_label=True): - loop = self.parse(ops, postprocess=self.postprocess) - token = JitCellToken() - pre = [] - tt = TargetToken(token) - if add_label: - pre = [ResOperation(rop.LABEL, loop.inputargs, None, descr=tt)] - else: - for i,op in enumerate(loop.operations): - if op.getopnum() == rop.LABEL: - op.setdescr(tt) - loop.operations = pre + filter(lambda op: op.getopnum() != rop.DEBUG_MERGE_POINT, loop.operations) - if loop.operations[-1].getopnum() == rop.JUMP: - loop.operations[-1].setdescr(token) - for op in loop.operations: - if op.getopnum() == rop.GUARD_EARLY_EXIT and op.getdescr() is None: - op.setdescr(compile.ResumeAtLoopHeaderDescr()) - return loop - def assert_vectorize(self, loop, expected_loop, call_pure_results=None): self._do_optimize_loop(loop, call_pure_results, export_state=True) self.assert_equal(loop, expected_loop) @@ -67,7 +48,7 @@ def vectoroptimizer(self, loop): metainterp_sd = FakeMetaInterpStaticData(self.cpu) jitdriver_sd = FakeJitDriverStaticData() - opt = VectorizingOptimizer(metainterp_sd, jitdriver_sd, loop, 0) + opt = VectorizingOptimizer(metainterp_sd, jitdriver_sd, 0) label_index = loop.find_first_index(rop.LABEL) opt.orig_label_args = loop.operations[label_index].getarglist()[:] return opt @@ -89,48 +70,48 @@ guard.setdescr(compile.ResumeAtLoopHeaderDescr()) loop.operations.insert(idx+1, guard) self.show_dot_graph(DependencyGraph(opt.loop), "original_" + self.test_name) - opt.analyse_index_calculations() - if opt.dependency_graph is not None: + graph = opt.analyse_index_calculations() + if graph is not None: cycle = opt.dependency_graph.cycles() if cycle is not None: print "CYCLE found %s" % cycle self.show_dot_graph(opt.dependency_graph, "early_exit_" + self.test_name) assert cycle is None - opt.schedule(False) + loop.operations = opt.schedule(False) opt.unroll_loop_iterations(loop, unroll_factor) opt.loop.operations = opt.get_newoperations() self.debug_print_operations(opt.loop) opt.clear_newoperations() - opt.dependency_graph = DependencyGraph(loop) - self.last_graph = opt.dependency_graph + graph = DependencyGraph(loop) + self.last_graph = graph self.show_dot_graph(self.last_graph, self.test_name) - return opt + return opt, graph def init_packset(self, loop, unroll_factor = -1): - opt = self.vectoroptimizer_unrolled(loop, unroll_factor) - opt.find_adjacent_memory_refs() + opt, graph = self.vectoroptimizer_unrolled(loop, unroll_factor) + opt.find_adjacent_memory_refs(graph) return opt def extend_packset(self, loop, unroll_factor = -1): - opt = self.vectoroptimizer_unrolled(loop, unroll_factor) - opt.find_adjacent_memory_refs() + opt, graph = self.vectoroptimizer_unrolled(loop, unroll_factor) + opt.find_adjacent_memory_refs(graph) opt.extend_packset() return opt def combine_packset(self, loop, unroll_factor = -1): - opt = self.vectoroptimizer_unrolled(loop, unroll_factor) - opt.find_adjacent_memory_refs() + opt, graph = self.vectoroptimizer_unrolled(loop, unroll_factor) + opt.find_adjacent_memory_refs(graph) opt.extend_packset() opt.combine_packset() return opt def schedule(self, loop, unroll_factor = -1, with_guard_opt=False): - opt = self.vectoroptimizer_unrolled(loop, unroll_factor) + opt, graph = self.vectoroptimizer_unrolled(loop, unroll_factor) opt.costmodel = FakeCostModel() - opt.find_adjacent_memory_refs() + opt.find_adjacent_memory_refs(graph) opt.extend_packset() opt.combine_packset() - opt.schedule(True) + opt.schedule(graph, True) if with_guard_opt: gso = GuardStrengthenOpt(opt.dependency_graph.index_vars, opt.has_two_labels) gso.propagate_all_forward(opt.loop) @@ -204,8 +185,7 @@ class BaseTestVectorize(VecTestHelper): - def test_vectorize_skip_impossible_1(self): - """ this trace does not contain a raw load / raw store from an array """ + def test_vectorize_skip(self): ops = """ [p0,i0] i1 = int_add(i0,1) diff --git a/rpython/jit/metainterp/optimizeopt/vector.py b/rpython/jit/metainterp/optimizeopt/vector.py --- a/rpython/jit/metainterp/optimizeopt/vector.py +++ b/rpython/jit/metainterp/optimizeopt/vector.py @@ -20,7 +20,7 @@ from rpython.jit.metainterp.optimizeopt.dependency import (DependencyGraph, MemoryRef, Node, IndexVar) from rpython.jit.metainterp.optimizeopt.version import LoopVersionInfo -from rpython.jit.metainterp.optimizeopt.schedule import (VecScheduleData, +from rpython.jit.metainterp.optimizeopt.schedule import (VecScheduleState, Scheduler, Pack, Pair, AccumPair, vectorbox_outof_box, getpackopnum, getunpackopnum, PackType, determine_input_output_types) from rpython.jit.metainterp.optimizeopt.guard import GuardStrengthenOpt @@ -31,9 +31,10 @@ from rpython.rlib.jit import Counters from rpython.rtyper.lltypesystem import lltype, rffi -class TraceLoop(object): +class VectorLoop(object): def __init__(self, label, oplist, jump): self.label = label + self.inputargs = label.getarglist() self.prefix = [] self.prefix_label = None assert self.label.getopnum() == rop.LABEL @@ -41,7 +42,7 @@ self.jump = jump assert self.jump.getopnum() == rop.JUMP - def all_operations(self): + def operation_list(self): return [self.label] + self.operations + [self.jump] def optimize_vector(metainterp_sd, jitdriver_sd, warmstate, loop_info, loop_ops): @@ -52,10 +53,10 @@ # the original loop (output of optimize_unroll) info = LoopVersionInfo(loop_info) version = info.snapshot(loop_ops, info.label_op) - loop = TraceLoop(loop_info.label_op, loop_ops[:-1], loop_ops[-1]) + loop = VectorLoop(loop_info.label_op, loop_ops[:-1], loop_ops[-1]) try: debug_start("vec-opt-loop") - metainterp_sd.logger_noopt.log_loop([], loop.all_operations(), -2, None, None, "pre vectorize") + metainterp_sd.logger_noopt.log_loop([], loop.operation_list(), -2, None, None, "pre vectorize") metainterp_sd.profiler.count(Counters.OPT_VECTORIZE_TRY) # start = time.clock() @@ -67,7 +68,7 @@ end = time.clock() # metainterp_sd.profiler.count(Counters.OPT_VECTORIZED) - metainterp_sd.logger_noopt.log_loop([], loop.all_operations(), -2, None, None, "post vectorize") + metainterp_sd.logger_noopt.log_loop([], loop.operation_list(), -2, None, None, "post vectorize") # nano = int((end-start)*10.0**9) debug_print("# vecopt factor: %d opcount: (%d -> %d) took %dns" % \ @@ -142,8 +143,7 @@ def __init__(self, metainterp_sd, jitdriver_sd, cost_threshold): Optimizer.__init__(self, metainterp_sd, jitdriver_sd) self.cpu = metainterp_sd.cpu - self.costmodel = X86_CostModel(cost_threshold, self.cpu.vector_register_size) - self.dependency_graph = None + self.cost_threshold = cost_threshold self.packset = None self.unroll_count = 0 self.smallest_type_bytes = 0 @@ -171,9 +171,10 @@ raise NotAVectorizeableLoop() # find index guards and move to the earliest position - self.analyse_index_calculations(loop) - if self.dependency_graph is not None: - self.schedule(False) # reorder the trace + graph = self.analyse_index_calculations(loop) + if graph is not None: + state = SchedulerState(graph) + self.schedule(state) # reorder the trace # unroll self.unroll_count = self.get_unroll_count(vsize) @@ -182,13 +183,15 @@ self.clear_newoperations(); # vectorize - self.dependency_graph = DependencyGraph(self.loop) + graph = DependencyGraph(loop) self.find_adjacent_memory_refs() self.extend_packset() self.combine_packset() - self.costmodel.reset_savings() - self.schedule(True) - if not self.costmodel.profitable(): + # TODO move cost model to CPU + costmodel = X86_CostModel(self.cpu, self.cost_threshold) + state = VecScheduleState(graph, self.packset, self.cpu, costmodel) + self.schedule(state) + if not state.profitable(): raise NotAProfitableLoop() def emit_unrolled_operation(self, op): @@ -308,7 +311,7 @@ unroll_count = simd_vec_reg_bytes // byte_count return unroll_count-1 # it is already unrolled once - def find_adjacent_memory_refs(self): + def find_adjacent_memory_refs(self, graph): """ The pre pass already builds a hash of memory references and the operations. Since it is in SSA form there are no array indices. If there are two array accesses in the unrolled loop @@ -320,7 +323,6 @@ operations = loop.operations self.packset = PackSet(self.cpu.vector_register_size) - graph = self.dependency_graph memory_refs = graph.memory_refs.items() # initialize the pack set for node_a,memref_a in memory_refs: @@ -447,59 +449,22 @@ if fail: assert False - def schedule(self, vector=False, sched_data=None): + def schedule(self, state): # TODO vector=False, sched_data=None): """ Scheduling the trace and emitting vector operations for packed instructions. """ - - self.clear_newoperations() - if sched_data is None: - sched_data = VecScheduleData(self.cpu.vector_register_size, - self.costmodel, self.orig_label_args) - self.dependency_graph.prepare_for_scheduling() - scheduler = Scheduler(self.dependency_graph, sched_data) - renamer = Renamer() - # - if vector: - self.packset.accumulate_prepare(sched_data, renamer) - # - for node in scheduler.schedulable_nodes: - op = node.getoperation() - if op.is_label(): - seen = sched_data.seen - for arg in op.getarglist(): - sched_data.seen[arg] = None - break - # - scheduler.emit_into(self._newoperations, renamer, unpack=vector) + state.prepare() + scheduler = Scheduler() + scheduler.walk_and_emit(state) # if not we_are_translated(): - for node in self.dependency_graph.nodes: + for node in graph.nodes: assert node.emitted - if vector and not self.costmodel.profitable(): + # + if state.profitable(): return - if vector: - # add accumulation info to the descriptor - for version in self.loop.versions: - # this needs to be done for renamed (accum arguments) - version.renamed_inputargs = [ renamer.rename_map.get(arg,arg) for arg in version.inputargs ] - self.appended_arg_count = len(sched_data.invariant_vector_vars) - #for guard_node in self.dependency_graph.guards: - # op = guard_node.getoperation() - # failargs = op.getfailargs() - # for i,arg in enumerate(failargs): - # if arg is None: - # continue - # accum = arg.getaccum() - # if accum: - # pass - # #accum.save_to_descr(op.getdescr(),i) - self.has_two_labels = len(sched_data.invariant_oplist) > 0 - self.loop.operations = self.prepend_invariant_operations(sched_data) - else: - self.loop.operations = self._newoperations - - self.clear_newoperations() + # + state.post_schedule() def prepend_invariant_operations(self, sched_data): """ Add invariant operations to the trace loop. returns the operation list @@ -540,7 +505,7 @@ that guards fail 'early' and relax dependencies. Without this step vectorization would not be possible! """ - self.dependency_graph = graph = DependencyGraph(loop) + graph = DependencyGraph(loop) ee_guard_node = graph.getnode(0) if ee_guard_node.getopnum() != rop.GUARD_EARLY_EXIT: raise NotAVectorizeableLoop() @@ -618,9 +583,9 @@ The main reaons to have this is of frequent unpack instructions, and the missing ability (by design) to detect not vectorizable loops. """ - def __init__(self, threshold, vec_reg_size): + def __init__(self, cpu, threshold): self.threshold = threshold - self.vec_reg_size = vec_reg_size + self.vec_reg_size = cpu.vector_register_size self.savings = 0 def reset_savings(self): @@ -850,11 +815,12 @@ # return None, -1 - def accumulate_prepare(self, sched_data, renamer): - vec_reg_size = sched_data.vec_reg_size + def accumulate_prepare(self, state): + vec_reg_size = state.vec_reg_size for pack in self.packs: if not pack.is_accumulating(): continue + xxx accum = pack.accum # create a new vector box for the parameters box = pack.input_type.new_vector_box() @@ -862,27 +828,27 @@ # reset the box to zeros or ones if accum.operator == Accum.PLUS: op = ResOperation(rop.VEC_BOX, [ConstInt(size)], box) - sched_data.invariant_oplist.append(op) + state.invariant_oplist.append(op) result = box.clonebox() op = ResOperation(rop.VEC_INT_XOR, [box, box], result) - sched_data.invariant_oplist.append(op) + state.invariant_oplist.append(op) box = result elif accum.operator == Accum.MULTIPLY: # multiply is only supported by floats op = ResOperation(rop.VEC_FLOAT_EXPAND, [ConstFloat(1.0), ConstInt(size)], box) - sched_data.invariant_oplist.append(op) + state.invariant_oplist.append(op) else: - raise NotImplementedError("can only handle + and *") + raise NotImplementedError("can only handle %s" % accum.operator) result = box.clonebox() assert isinstance(result, BoxVector) result.accum = accum # pack the scalar value op = ResOperation(getpackopnum(box.gettype()), [box, accum.var, ConstInt(0), ConstInt(1)], result) - sched_data.invariant_oplist.append(op) + state.invariant_oplist.append(op) # rename the variable with the box - sched_data.setvector_of_box(accum.getoriginalbox(), 0, result) # prevent it from expansion - renamer.start_renaming(accum.getoriginalbox(), result) + state.setvector_of_box(accum.getoriginalbox(), 0, result) # prevent it from expansion + state.renamer.start_renaming(accum.getoriginalbox(), result) def split_overloaded_packs(self): newpacks = [] diff --git a/rpython/jit/metainterp/resoperation.py b/rpython/jit/metainterp/resoperation.py --- a/rpython/jit/metainterp/resoperation.py +++ b/rpython/jit/metainterp/resoperation.py @@ -66,6 +66,9 @@ def is_inputarg(self): return False + def returns_vector(self): + return False + def ResOperation(opnum, args, descr=None): cls = opclasses[opnum] op = cls() @@ -88,6 +91,7 @@ return self._forwarded + class AbstractResOp(AbstractResOpOrInputArg): """The central ResOperation class, representing one operation.""" @@ -101,8 +105,7 @@ type = 'v' boolreflex = -1 boolinverse = -1 - vector = -1 - casts = ('\x00', -1, '\x00', -1) + vector = -1 # -1 means, no vector equivalent, -2 it is a vector statement def getopnum(self): return self.opnum @@ -357,6 +360,12 @@ def is_label(self): return self.getopnum() == rop.LABEL + def returns_void(self): + return self.type == 'v' + + def returns_vector(self): + return self.type != 'v' and self.vector == -2 + # =================== # Top of the hierachy # =================== @@ -365,6 +374,9 @@ pass class CastResOp(AbstractResOp): + _attrs_ = ('casts') + casts = ('\x00', -1, '\x00', -1) + def casts_box(self): return True @@ -546,8 +558,6 @@ _attrs_ = ('item_type','item_count','item_size','item_signed','accum') _extended_display = False - type = 'V' - #def __init__(self, item_type=FLOAT, item_count=2, item_size=8, item_signed=False, accum=None): # assert item_type in (FLOAT, INT) # self.item_type = item_type @@ -651,6 +661,13 @@ def reset_value(self): self.setref_base(lltype.nullptr(llmemory.GCREF.TO)) +class InputArgVector(VectorOp, AbstractInputArg): + def __init__(self): + pass + + def returns_vector(self): + return True + # ============ # arity mixins # ============ @@ -1154,6 +1171,8 @@ mixins.append(RefOp) else: assert result_type == 'n' + if name.startswith('VEC'): + mixins.insert(1,VectorOp) cls_name = '%s_OP' % name bases = (get_base_class(tuple(mixins), baseclass),) @@ -1271,6 +1290,8 @@ cls.vector = _opvector[opnum] if name in _cast_ops: cls.casts = _cast_ops[name] + if name.startswith('VEC'): + cls.vector = -2 setup2() del _opboolinverse del _opboolreflex _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit