Author: Richard Plangger <planri...@gmail.com> Branch: ppc-vsx-support Changeset: r87028:88be11973c7a Date: 2016-09-12 12:04 +0200 http://bitbucket.org/pypy/pypy/changeset/88be11973c7a/
Log: fixed vecopt tests with the new delayed execution diff --git a/rpython/jit/metainterp/optimizeopt/dependency.py b/rpython/jit/metainterp/optimizeopt/dependency.py --- a/rpython/jit/metainterp/optimizeopt/dependency.py +++ b/rpython/jit/metainterp/optimizeopt/dependency.py @@ -1057,7 +1057,7 @@ var = ResOperation(rop.INT_ADD, args) tolist.append(var) if self.constant < 0: - args = [var, ConstInt(self.constant)] + args = [var, ConstInt(-self.constant)] var = ResOperation(rop.INT_SUB, args) tolist.append(var) return tolist diff --git a/rpython/jit/metainterp/optimizeopt/schedule.py b/rpython/jit/metainterp/optimizeopt/schedule.py --- a/rpython/jit/metainterp/optimizeopt/schedule.py +++ b/rpython/jit/metainterp/optimizeopt/schedule.py @@ -55,18 +55,29 @@ op = node.getoperation() if op in needs_resolving: # either it is a normal operation, or we know that there is a linear combination + del needs_resolving[op] if op in indexvars: indexvar = indexvars[op] for operation in indexvar.get_operations(): - self.oplist.append(operation) + self.append_to_oplist(operation) last = operation + indexvars[last] = indexvar self.renamer.start_renaming(op, last) - del needs_resolving[op] else: - del needs_resolving[op] self.resolve_delayed(needs_resolving, delayed, op) - self.oplist.append(op) + self.append_to_oplist(op) + self.seen[op] = None + if len(delayed) > i: + del delayed[i] i -= 1 + # some times the recursive call can remove several items from delayed, + # thus we correct the index here + if len(delayed) <= i: + i = len(delayed)-1 + + def append_to_oplist(self, op): + self.renamer.rename(op) + self.oplist.append(op) def post_schedule(self): @@ -106,7 +117,79 @@ def try_emit_or_delay(self, node): # implement me in subclass. e.g. as in VecScheduleState - raise NotImplementedError + + if not node.is_imaginary() and node.is_pure(): + # this operation might never be emitted. only if it is really needed + self.delay_emit(node) + return + # emit a now! + self.pre_emit(node) + self.mark_emitted(node) + if not node.is_imaginary(): + op = node.getoperation() + self.seen[op] = None + self.append_to_oplist(op) + + def delay_emit(self, node): + """ it has been decided that the operation might be scheduled later """ + delayed = node.delayed or [] + delayed.append(node) + node.delayed = None + provides = node.provides() + if len(provides) == 0: + self.delayed.append(node) + else: + for to in node.provides(): + tnode = to.target_node() + self.delegate_delay(tnode, delayed[:]) + self.mark_emitted(node) + + def delegate_delay(self, node, delayed): + """ Chain up delays, this can reduce many more of the operations """ + if node.delayed is None: + node.delayed = delayed + else: + delayedlist = node.delayed + for d in delayed: + delayedlist.append(d) + + + def mark_emitted(state, node, unpack=True): + """ An operation has been emitted, adds new operations to the worklist + whenever their dependency count drops to zero. + Keeps worklist sorted (see priority) """ + worklist = state.worklist + provides = node.provides()[:] + for dep in provides: # COPY + target = dep.to + node.remove_edge_to(target) + if not target.emitted and target.depends_count() == 0: + # sorts them by priority + i = len(worklist)-1 + while i >= 0: + cur = worklist[i] + c = (cur.priority - target.priority) + if c < 0: # meaning itnode.priority < target.priority: + worklist.insert(i+1, target) + break + elif c == 0: + # if they have the same priority, sort them + # using the original position in the trace + if target.getindex() < cur.getindex(): + worklist.insert(i+1, target) + break + i -= 1 + else: + worklist.insert(0, target) + node.clear_dependencies() + node.emitted = True + if not node.is_imaginary(): + op = node.getoperation() + state.renamer.rename(op) + if unpack: + state.ensure_args_unpacked(op) + state.post_emit(node) + def delay(self, node): return False @@ -120,8 +203,27 @@ def post_emit(self, node): pass - def pre_emit(self, node): - pass + def pre_emit(self, node, pack_first=True): + delayed = node.delayed + if delayed: + # there are some nodes that have been delayed just for this operation + if pack_first: + op = node.getoperation() + self.resolve_delayed({}, delayed, op) + for node in delayed: + if node in self.seen: + continue + if node is not None: + provides = node.provides() + if len(provides) == 0: + # add this node to the final delay list + # might be emitted before jump! + self.delayed.append(node) + else: + for to in node.provides(): + tnode = to.target_node() + self.delegate_delay(tnode, [node]) + node.delayed = None class Scheduler(object): """ Create an instance of this class to (re)schedule a vector trace. """ @@ -412,7 +514,7 @@ assert vecinfo.count > argvecinfo.count def expand(state, pack, args, arg, index): - """ Expand a value into a vector box. useful for arith metic + """ Expand a value into a vector box. useful for arithmetic of one vector with a scalar (either constant/varialbe) """ left = pack.leftmost() @@ -547,23 +649,8 @@ failargs[i] = self.renamer.rename_map.get(seed, seed) op.setfailargs(failargs) - delayed = node.delayed - if delayed: - # there are some nodes that have been delayed just for this operation - if pack_first: - self.resolve_delayed({}, delayed, op) - for node in delayed: - if node is not None: - provides = node.provides() - if len(provides) == 0: - # add this node to the final delay list - # might be emitted before jumping! - self.delayed.append(node) - else: - for to in node.provides(): - tnode = to.target_node() - self.delegate_delay(tnode, [node]) - node.delayed = None + SchedulerState.pre_emit(self, node, pack_first) + def profitable(self): return self.costmodel.profitable() @@ -584,76 +671,7 @@ turn_into_vector(self, node.pack) return elif not node.emitted: - if not node.is_imaginary() and node.is_pure(): - # this operation might never be emitted. only if it is really needed - self.delay_emit(node) - return - # emit a now! - self.pre_emit(node) - self.mark_emitted(node) - if not node.is_imaginary(): - op = node.getoperation() - self.seen[op] = None - self.oplist.append(op) - - def delay_emit(self, node): - """ it has been decided that the operation might be scheduled later """ - delayed = node.delayed or [] - delayed.append(node) - node.delayed = None - provides = node.provides() - if len(provides) == 0: - self.delayed.append(node) - else: - for to in node.provides(): - tnode = to.target_node() - self.delegate_delay(tnode, delayed[:]) - self.mark_emitted(node) - - def delegate_delay(self, node, delayed): - """ Chain up delays, this can reduce many more of the operations """ - if node.delayed is None: - node.delayed = delayed - else: - delayedlist = node.delayed - for d in delayed: - delayedlist.append(d) - - def mark_emitted(state, node, unpack=True): - """ An operation has been emitted, adds new operations to the worklist - whenever their dependency count drops to zero. - Keeps worklist sorted (see priority) """ - worklist = state.worklist - provides = node.provides()[:] - for dep in provides: # COPY - target = dep.to - node.remove_edge_to(target) - if not target.emitted and target.depends_count() == 0: - # sorts them by priority - i = len(worklist)-1 - while i >= 0: - cur = worklist[i] - c = (cur.priority - target.priority) - if c < 0: # meaning itnode.priority < target.priority: - worklist.insert(i+1, target) - break - elif c == 0: - # if they have the same priority, sort them - # using the original position in the trace - if target.getindex() < cur.getindex(): - worklist.insert(i+1, target) - break - i -= 1 - else: - worklist.insert(0, target) - node.clear_dependencies() - node.emitted = True - if not node.is_imaginary(): - op = node.getoperation() - state.renamer.rename(op) - if unpack: - state.ensure_args_unpacked(op) - state.post_emit(node) + SchedulerState.try_emit_or_delay(self, node) def delay(self, node): if node.pack: @@ -706,7 +724,7 @@ self.renamer.start_renaming(arg, vecop) self.seen[vecop] = None self.costmodel.record_vector_unpack(var, pos, 1) - self.oplist.append(vecop) + self.append_to_oplist(vecop) return vecop return arg diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vecopt.py b/rpython/jit/metainterp/optimizeopt/test/test_vecopt.py --- a/rpython/jit/metainterp/optimizeopt/test/test_vecopt.py +++ b/rpython/jit/metainterp/optimizeopt/test/test_vecopt.py @@ -271,7 +271,7 @@ def test_opcount_filling_guard(self): descr = ArrayDescr(0,4, None, 'S') - vec = ResOperation(rop.VEC_LOAD_I, ['a','i'], descr=descr) + vec = ResOperation(rop.VEC_LOAD_I, ['a','i', 8, 0], descr=descr) vec.count = 4 pack = Pack([Node(ResOperation(rop.GUARD_TRUE, [vec]), 0), Node(ResOperation(rop.GUARD_TRUE, [vec]), 1), @@ -319,7 +319,7 @@ self.vectorize(trace) self.debug_print_operations(trace) self.ensure_operations([ - 'v10[4xi32] = vec_load_i(p0,i0,descr=int32arraydescr)', + 'v10[4xi32] = vec_load_i(p0,i0,4,0,descr=int32arraydescr)', 'v11[4xi32] = vec_int_is_true(v10[4xi32])', 'i100 = vec_unpack_i(v11[4xi32], 0, 1)', 'vec_guard_true(v11[4xi32]) [i100]', @@ -384,9 +384,8 @@ """ opt = """ [p0,i0] - i1 = int_add(i0,1) + v3[2xi64] = vec_load_i(p0,i0,8,0,descr=arraydescr) i2 = int_add(i0,2) - v3[2xi64] = vec_load_i(p0,i0,descr=arraydescr) jump(p0,i2) """ loop = self.parse_loop(ops) @@ -799,13 +798,11 @@ for i in range(15): for j in range(15): try: + mref1 = graph.getmemref(i) + mref2 = graph.getmemref(j) if i-4 == j or i+4 == j: - mref1 = graph.getmemref(i) - mref2 = graph.getmemref(j) assert mref1.is_adjacent_to(mref2) else: - mref1 = graph.getmemref(i) - mref2 = graph.getmemref(j) assert not mref1.is_adjacent_to(mref2) except KeyError: pass @@ -846,11 +843,13 @@ loop = self.parse_loop(ops) vopt, graph = self.extend_packset(loop,1) assert len(graph.memory_refs) == 2 - self.assert_independent(graph, 5,10) - assert len(vopt.packset.packs) == 2 + self.assert_independent(graph, 3,7) + # the delayed scheduling strips away the vectorized addition, + # because it is never used + assert len(vopt.packset.packs) == 1 self.assert_packset_empty(vopt.packset, len(loop.operations), - [(5,10), (4,9)]) + [(4,8)]) def test_packset_extend_load_modify_store(self): ops = """ @@ -1009,11 +1008,11 @@ i1 = int_add(i0, {stride}) i11 = int_le(i1, 128) guard_true(i11) [p0,p1,p2,i1] - i12 = int_add(i1, {stride}) - v1 = vec_load{suffix}(p0, i0, descr={descr}arraydescr) - v2 = vec_load{suffix}(p1, i0, descr={descr}arraydescr) + v1 = vec_load{suffix}(p0, i0,8,0, descr={descr}arraydescr) + v2 = vec_load{suffix}(p1, i0,8,0, descr={descr}arraydescr) v3 = {op}(v1,v2) - vec_store(p2, i0, v3, descr={descr}arraydescr) + vec_store(p2, i0, v3,8,0, descr={descr}arraydescr) + i12 = int_add(i0, 2) jump(p0,p1,p2,i12) """.format(op='vec_'+op,descr=descr,stride=1,suffix=suffix) loop = self.parse_loop(ops) @@ -1038,15 +1037,14 @@ i11 = int_add(i0, 1) i12 = int_lt(i11, i1) guard_true(i12) [i0,i1,i2,i3,i4] - i6 = int_mul(i0, 8) - i13 = int_add(i11, 1) + i13 = int_add(i0, 2) i18 = int_lt(i13, i1) guard_true(i18) [i11,i1,i2,i3,i4] - i14 = int_mul(i11, 8) - v19[2xi64] = vec_load_i(i2, i6, descr=arraydescr) - v20[2xi64] = vec_load_i(i3, i6, descr=arraydescr) + i6 = int_mul(i0, 8) + v19[2xi64] = vec_load_i(i2, i6, 1, 0, descr=arraydescr) + v20[2xi64] = vec_load_i(i3, i6, 1, 0, descr=arraydescr) v21[2xi64] = vec_int_add(v19, v20) - vec_store(i4, i6, v21, descr=arraydescr) + vec_store(i4, i6, v21, 1, 0, descr=arraydescr) jump(i13, i1, i2, i3, i4) """ loop = self.parse_loop(ops) @@ -1076,7 +1074,7 @@ {dead_code} i500 = int_add(i0, 16) i501 = int_lt(i500, 102) - v10[16xi8] = vec_load_i(p0, i0, descr=chararraydescr) + v10[16xi8] = vec_load_i(p0, i0, 1, 0, descr=chararraydescr) jump(p0,i2) """.format(dead_code=dead_code) loop = self.parse_loop(ops) @@ -1105,6 +1103,7 @@ [p0,i0] i1 = getarrayitem_raw_i(p0, i0, descr=arraydescr) i4 = int_sub(i1, 42) + setarrayitem_raw(p0, i0, i4, descr=arraydescr) i3 = int_add(i0,1) i5 = int_lt(i3, 10) guard_true(i5) [p0, i0] @@ -1121,8 +1120,9 @@ guard_true(i3) [p0,i0] i4 = int_add(i0, 2) i5 = int_lt(i4, 10) - v1[2xf64] = vec_load_i(p0, i0, descr=arraydescr) + v1[2xf64] = vec_load_i(p0, i0, 8, 0, descr=arraydescr) v2[2xf64] = vec_int_sub(v1[2xf64], v3[2xf64]) + vec_store(p0, i0, v2[2xf64], 8, 0, descr=arraydescr) jump(p0,i2,v3[2xf64]) """ loop = self.parse_loop(ops) @@ -1134,6 +1134,7 @@ [p0,i0,f3] f1 = getarrayitem_raw_f(p0, i0, descr=floatarraydescr) f4 = float_add(f1, f3) + setarrayitem_raw(p0, i0, f4, descr=floatarraydescr) i3 = int_add(i0,1) i5 = int_lt(i3, 10) guard_true(i5) [p0, i0] @@ -1150,8 +1151,9 @@ guard_true(i3) [p0,i0,f3] i4 = int_add(i0, 2) i5 = int_lt(i4, 10) - v1[2xf64] = vec_load_f(p0, i0, descr=floatarraydescr) + v1[2xf64] = vec_load_f(p0, i0, 8, 0, descr=floatarraydescr) v2[2xf64] = vec_float_add(v1[2xf64], v3[2xf64]) + vec_store(p0, i0, v2[2xf64], 8, 0, descr=floatarraydescr) jump(p0,i2,f3,v3[2xf64]) """ loop = self.parse_loop(ops) @@ -1181,7 +1183,7 @@ guard_true(i2) [p0, i0, v2[2xf64]] i10 = int_add(i0, 16) i20 = int_lt(i10, 100) - v1[2xf64] = vec_load_f(p0, i0, descr=floatarraydescr) + v1[2xf64] = vec_load_f(p0, i0, 1, 0, descr=floatarraydescr) v3[2xf64] = vec_float_add(v2[2xf64], v1[2xf64]) jump(p0, i1, v3[2xf64]) """ @@ -1213,21 +1215,18 @@ i54 = int_add(i28, 2) i638 = int_ge(i54, i18) guard_false(i638) [p36, i28, p9, i37, p14, f34, p12, p38, f35, p39, i40, i41, p42, i43, i44, i21, i4, i0, i18] - i12 = int_add(i44, 8) - i56 = int_add(i41, 8) - i46 = int_add(i37, 8) i47 = int_add(i28, 2) i52 = int_ge(i47, i18) + v61[2xf64] = vec_load_f(i21, i44, 1, 0, descr=floatarraydescr) + v62[2xf64] = vec_load_f(i4, i41, 1, 0, descr=floatarraydescr) + v63[2xf64] = vec_float_add(v61, v62) + vec_store(i0, i37, v63, 1, 0, descr=floatarraydescr) i55 = int_add(i44, 16) i629 = int_add(i41, 16) i637 = int_add(i37, 16) - v61[2xf64] = vec_load_f(i21, i44, descr=floatarraydescr) - v62[2xf64] = vec_load_f(i4, i41, descr=floatarraydescr) - v63[2xf64] = vec_float_add(v61, v62) - vec_store(i0, i37, v63, descr=floatarraydescr) f100 = vec_unpack_f(v61, 1, 1) f101 = vec_unpack_f(v62, 1, 1) - jump(p36, i637, p9, i56, p14, f100, p12, p38, f101, p39, i40, i54, p42, i43, i55, i21, i4, i0, i18) + jump(p36, i637, p9, i629, p14, f100, p12, p38, f101, p39, i40, i54, p42, i43, i55, i21, i4, i0, i18) """) vopt = self.vectorize(trace) self.assert_equal(trace, trace_opt) @@ -1256,12 +1255,12 @@ i11 = int_ge(i6, 36) i7 = int_add(i1, 4) i14 = int_ge(i7, 36) - v17 = vec_load_f(p0, i1, descr=floatarraydescr) + v17 = vec_load_f(p0, i1, 8, 0, descr=floatarraydescr) v19 = vec_cast_float_to_singlefloat(v17) - v18 = vec_load_f(p0, i5, descr=floatarraydescr) + v18 = vec_load_f(p0, i5, 8, 0, descr=floatarraydescr) v20 = vec_cast_float_to_singlefloat(v18) v21 = vec_pack_i(v19, v20, 2, 2) - vec_store(p1, i1, v21, descr=float32arraydescr) + vec_store(p1, i1, v21, 4, 0, descr=float32arraydescr) jump(p0, p1, i50) """ loop = self.parse_loop(ops) @@ -1291,21 +1290,18 @@ i500 = int_add(i4, 16) i501 = int_lt(i500, 100) guard_true(i501) [p0, p1, p2, i0, i4] - i189 = int_add(i0, 4) i187 = int_add(i4, 8) i188 = int_lt(i187, 100) - i207 = int_add(i0, 8) i196 = int_add(i4, 12) i197 = int_lt(i196, 100) - i205 = int_add(i0, 12) i400 = int_add(i4, 16) i401= int_lt(i400, 100) - i402 = int_add(i0, 16) - v228[4xi32] = vec_load_i(p0, i0, descr=float32arraydescr) + v228[4xi32] = vec_load_i(p0, i0, 1, 0, descr=float32arraydescr) v229[2xf64] = vec_cast_singlefloat_to_float(v228) v230 = vec_unpack_i(v228, 2, 2) v231 = vec_cast_singlefloat_to_float(v230) - v232 = vec_load_i(p1, i189, descr=float32arraydescr) + i189 = int_add(i0, 4) + v232 = vec_load_i(p1, i189, 1, 0, descr=float32arraydescr) v233 = vec_cast_singlefloat_to_float(v232) v236 = vec_float_add(v229, v233) v238 = vec_cast_float_to_singlefloat(v236) @@ -1314,7 +1310,8 @@ v237 = vec_float_add(v231, v235) v239 = vec_cast_float_to_singlefloat(v237) v240 = vec_pack_i(v238, v239, 2, 2) - vec_store(p2, i4, v240, descr=float32arraydescr) + vec_store(p2, i4, v240, 1, 0, descr=float32arraydescr) + i207 = int_add(i0, 16) jump(p0, p1, p2, i207, i500) """) vopt = self.vectorize(trace) @@ -1354,7 +1351,7 @@ """) vopt = self.schedule(trace) self.ensure_operations([ - 'v10[2xf64] = vec_load_f(p0,i0,descr=floatarraydescr)', + 'v10[2xf64] = vec_load_f(p0,i0,8,0,descr=floatarraydescr)', 'v11[2xf64] = vec_float_mul(v10[2xf64], v9[2xf64])', 'v12[2xf64] = vec_float_eq(v11[2xf64], v11[2xf64])', 'i100 = vec_unpack_f(v12[4xi32], 0, 1)', _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit