Author: Richard Plangger <[email protected]>
Branch: vecopt
Changeset: r78376:ac77811327eb
Date: 2015-07-01 12:35 +0200
http://bitbucket.org/pypy/pypy/changeset/ac77811327eb/
Log: correctly emit the reduction operation in a guard exit that compiles a
bridge (it was missing before); added prod(...) as an accumulator
diff --git a/pypy/module/micronumpy/compile.py
b/pypy/module/micronumpy/compile.py
--- a/pypy/module/micronumpy/compile.py
+++ b/pypy/module/micronumpy/compile.py
@@ -37,19 +37,10 @@
class BadToken(Exception):
pass
-class FakeArguments(W_Root):
- def __init__(self, args_w, kw_w):
- self.args_w = args_w
- self.kw_w = kw_w
-
- def unpack(self):
- return self.args_w, self.kw_w
-
-
SINGLE_ARG_FUNCTIONS = ["sum", "prod", "max", "min", "all", "any",
"unegative", "flat", "tostring", "count_nonzero",
"argsort", "cumsum", "logical_xor_reduce"]
-TWO_ARG_FUNCTIONS = ["dot", 'multiply', 'take', 'searchsorted']
+TWO_ARG_FUNCTIONS = ["dot", 'take', 'searchsorted', 'multiply']
TWO_ARG_FUNCTIONS_OR_NONE = ['view', 'astype', 'reshape']
THREE_ARG_FUNCTIONS = ['where']
@@ -787,7 +778,7 @@
raise ArgumentNotAnArray
if self.name == "dot":
w_res = arr.descr_dot(interp.space, arg)
- if self.name == "multiply":
+ elif self.name == 'multiply':
w_res = arr.descr_mul(interp.space, arg)
elif self.name == 'take':
w_res = arr.descr_take(interp.space, arg)
@@ -808,7 +799,7 @@
if self.name == "where":
w_res = where(interp.space, arr, arg1, arg2)
else:
- assert False
+ assert False # unreachable code
elif self.name in TWO_ARG_FUNCTIONS_OR_NONE:
if len(self.args) != 2:
raise ArgumentMismatch
@@ -822,7 +813,7 @@
assert isinstance(w_arg, ArrayConstant)
w_res = arr.reshape(interp.space, w_arg.wrap(interp.space))
else:
- assert False, "missing two arg impl for: %s" % (self.name,)
+ assert False
else:
raise WrongFunctionName
if isinstance(w_res, W_NDimArray):
diff --git a/pypy/module/micronumpy/loop.py b/pypy/module/micronumpy/loop.py
--- a/pypy/module/micronumpy/loop.py
+++ b/pypy/module/micronumpy/loop.py
@@ -421,8 +421,8 @@
lval = left_impl.getitem(i1).convert_to(space, dtype)
rval = right_impl.getitem(i2).convert_to(space, dtype)
oval = dtype.itemtype.add(oval, dtype.itemtype.mul(lval, rval))
- i1 += s1
- i2 += s2
+ i1 += jit.promote(s1)
+ i2 += jit.promote(s2)
outi.setitem(outs, oval)
outs = outi.next(outs)
rights = righti.next(rights)
diff --git a/pypy/module/micronumpy/test/test_zjit.py
b/pypy/module/micronumpy/test/test_zjit.py
--- a/pypy/module/micronumpy/test/test_zjit.py
+++ b/pypy/module/micronumpy/test/test_zjit.py
@@ -118,15 +118,18 @@
retval = self.interp.eval_graph(self.graph, [i])
return retval
- def define_matrix_dot():
+ def define_dot_matrix():
return """
mat = |16|
m = reshape(mat, [4,4])
+ vec = [0,1,2,3]
+ a = dot(m, vec)
+ a -> 3
"""
- def test_matrix_dot(self):
- result = self.run("matrix_dot")
- assert int(result) == 45
+ def test_dot_matrix(self):
+ result = self.run("dot_matrix")
+ assert int(result) == 86
self.check_vectorized(1, 1)
def define_float32_copy():
@@ -523,6 +526,7 @@
expected *= i * 2
assert result == expected
self.check_trace_count(1)
+ self.check_vectorized(1, 1)
def define_max():
return """
@@ -534,7 +538,7 @@
def test_max(self):
result = self.run("max")
assert result == 128
- self.check_vectorized(1, 0) # TODO reduce
+ self.check_vectorized(1, 0)
def define_min():
return """
@@ -546,7 +550,7 @@
def test_min(self):
result = self.run("min")
assert result == -128
- self.check_vectorized(1, 0) # TODO reduce
+ self.check_vectorized(1, 0)
def define_any():
return """
@@ -820,8 +824,8 @@
def test_dot(self):
result = self.run("dot")
assert result == 184
- self.check_trace_count(3)
- self.check_vectorized(3,0)
+ self.check_trace_count(5)
+ self.check_vectorized(3,1)
def define_argsort():
return """
diff --git a/rpython/jit/backend/llgraph/runner.py
b/rpython/jit/backend/llgraph/runner.py
--- a/rpython/jit/backend/llgraph/runner.py
+++ b/rpython/jit/backend/llgraph/runner.py
@@ -880,9 +880,6 @@
if isinstance(box, BoxVectorAccum):
if box.operator == '+':
value = sum(value)
- elif box.operator == '-':
- def sub(acc, x): return acc - x
- value = reduce(sub, value, 0)
elif box.operator == '*':
def prod(acc, x): return acc * x
value = reduce(prod, value, 1)
diff --git a/rpython/jit/backend/x86/assembler.py
b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -559,6 +559,8 @@
self.current_clt.allgcrefs,
self.current_clt.frame_info)
self._check_frame_depth(self.mc, regalloc.get_gcmap())
+ #import pdb; pdb.set_trace()
+ self._accum_update_at_exit(arglocs, inputargs, faildescr, regalloc)
frame_depth_no_fixed_size = self._assemble(regalloc, inputargs,
operations)
codeendpos = self.mc.get_relative_pos()
self.write_pending_failure_recoveries(regalloc)
@@ -1865,7 +1867,7 @@
startpos = self.mc.get_relative_pos()
#
self._accum_update_at_exit(guardtok.fail_locs, guardtok.failargs,
- regalloc)
+ guardtok.faildescr, regalloc)
#
fail_descr, target = self.store_info_on_descr(startpos, guardtok)
self.mc.PUSH(imm(fail_descr))
@@ -2529,67 +2531,60 @@
# vector operations
# ________________________________________
- def _accum_update_at_exit(self, fail_locs, fail_args, regalloc):
+ def _accum_update_at_exit(self, fail_locs, fail_args, faildescr, regalloc):
""" If accumulation is done in this loop, at the guard exit
some vector registers must be adjusted to yield the correct value"""
assert regalloc is not None
- for i,arg in enumerate(fail_args):
- if arg is None:
- continue
+ accum_info = faildescr.rd_accum_list
+ while accum_info:
+ pos = accum_info.position
+ loc = fail_locs[pos]
+ assert isinstance(loc, RegLoc)
+ arg = fail_args[pos]
if isinstance(arg, BoxVectorAccum):
- assert arg.scalar_var is not None
- loc = fail_locs[i]
- assert isinstance(loc, RegLoc)
- assert loc.is_xmm
- tgtloc = regalloc.force_allocate_reg(arg.scalar_var, fail_args)
- assert tgtloc is not None
- if arg.operator == '+':
- # reduction using plus
- self._accum_reduce_sum(arg, loc, tgtloc)
- fail_locs[i] = tgtloc
- regalloc.possibly_free_var(arg)
- fail_args[i] = arg.scalar_var
- else:
- raise NotImplementedError("accum operator %s not
implemented" %
- (arg.operator))
+ arg = arg.scalar_var
+ assert arg is not None
+ tgtloc = regalloc.force_allocate_reg(arg, fail_args)
+ if accum_info.operation == '+':
+ # reduction using plus
+ self._accum_reduce_sum(arg, loc, tgtloc)
+ elif accum_info.operation == '*':
+ self._accum_reduce_mul(arg, loc, tgtloc)
+ else:
+ import pdb; pdb.set_trace()
+ not_implemented("accum operator %s not implemented" %
+ (accum_info.operation))
+ fail_locs[pos] = tgtloc
+ regalloc.possibly_free_var(arg)
+ accum_info = accum_info.prev
- def _accum_reduce_sum(self, vector_var, accumloc, targetloc):
- assert isinstance(vector_var, BoxVectorAccum)
- #
- type = vector_var.gettype()
- size = vector_var.getsize()
- if type == FLOAT:
- if size == 8:
- # r = (r[0]+r[1],r[0]+r[1])
- self.mc.HADDPD(accumloc, accumloc)
- # upper bits (> 64) are dirty (but does not matter)
- if accumloc is not targetloc:
- self.mov(targetloc, accumloc)
- return
- if size == 4:
- # r = (r[0]+r[1],r[2]+r[3],r[0]+r[1],r[2]+r[3])
- self.mc.HADDPS(accumloc, accumloc)
- self.mc.HADDPS(accumloc, accumloc)
- # invoking it a second time will gather the whole sum
- # at the first element position
- # the upper bits (>32) are dirty (but does not matter)
- if accumloc is not targetloc:
- self.mov(targetloc, accumloc)
- return
- elif type == INT:
+ def _accum_reduce_mul(self, arg, accumloc, targetloc):
+ scratchloc = X86_64_SCRATCH_REG
+ self.mc.mov(scratchloc, accumloc)
+ # swap the two elements
+ self.mc.SHUFPS_xxi(scratchloc.value, scratchloc.value, 0x01)
+ self.mc.MULPD(accumloc, scratchloc)
+ if accumloc is not targetloc:
+ self.mc.mov(targetloc, accumloc)
+
+ def _accum_reduce_sum(self, arg, accumloc, targetloc):
+ # Currently the accumulator can ONLY be the biggest
+ # size for X86 -> 64 bit float/int
+ if arg.type == FLOAT:
+ # r = (r[0]+r[1],r[0]+r[1])
+ self.mc.HADDPD(accumloc, accumloc)
+ # upper bits (> 64) are dirty (but does not matter)
+ if accumloc is not targetloc:
+ self.mov(targetloc, accumloc)
+ return
+ elif arg.type == INT:
scratchloc = X86_64_SCRATCH_REG
- if size == 8:
- self.mc.PEXTRQ_rxi(targetloc.value, accumloc.value, 0)
- self.mc.PEXTRQ_rxi(scratchloc.value, accumloc.value, 1)
- self.mc.ADD(targetloc, scratchloc)
- return
- if size == 4:
- self.mc.PHADDD(accumloc, accumloc)
- self.mc.PHADDD(accumloc, accumloc)
- self.mc.PEXTRD_rxi(targetloc.value, accumloc.value, 0)
- return
+ self.mc.PEXTRQ_rxi(targetloc.value, accumloc.value, 0)
+ self.mc.PEXTRQ_rxi(scratchloc.value, accumloc.value, 1)
+ self.mc.ADD(targetloc, scratchloc)
+ return
- raise NotImplementedError("reduce sum for %s not impl." % vector_var)
+ not_implemented("reduce sum for %s not impl." % arg)
def genop_vec_getarrayitem_raw(self, op, arglocs, resloc):
# considers item scale (raw_load does not)
@@ -2655,7 +2650,7 @@
# There is no 64x64 bit packed mul and I did not find one
# for 8 bit either. It is questionable if it gives any benefit
# for 8 bit.
- raise NotImplementedError("")
+ not_implemented("int8/64 mul")
def genop_vec_int_add(self, op, arglocs, resloc):
loc0, loc1, size_loc = arglocs
@@ -2757,7 +2752,7 @@
# the speedup might only be modest...
# the optimization does not emit such code!
msg = "vec int signext (%d->%d)" % (size, tosize)
- raise NotImplementedError(msg)
+ not_implemented(msg)
def genop_vec_float_expand(self, op, arglocs, resloc):
srcloc, sizeloc = arglocs
diff --git a/rpython/jit/metainterp/compile.py
b/rpython/jit/metainterp/compile.py
--- a/rpython/jit/metainterp/compile.py
+++ b/rpython/jit/metainterp/compile.py
@@ -488,7 +488,8 @@
class ResumeGuardDescr(ResumeDescr):
_attrs_ = ('rd_numb', 'rd_count', 'rd_consts', 'rd_virtuals',
- 'rd_frame_info_list', 'rd_pendingfields', 'status')
+ 'rd_frame_info_list', 'rd_pendingfields', 'rd_accum_list',
+ 'status')
rd_numb = lltype.nullptr(NUMBERING)
rd_count = 0
@@ -496,6 +497,7 @@
rd_virtuals = None
rd_frame_info_list = None
rd_pendingfields = lltype.nullptr(PENDINGFIELDSP.TO)
+ rd_accum_list = None
status = r_uint(0)
@@ -507,6 +509,7 @@
self.rd_pendingfields = other.rd_pendingfields
self.rd_virtuals = other.rd_virtuals
self.rd_numb = other.rd_numb
+ self.rd_accum_list = other.rd_accum_list
# we don't copy status
ST_BUSY_FLAG = 0x01 # if set, busy tracing from the guard
diff --git a/rpython/jit/metainterp/optimizeopt/schedule.py
b/rpython/jit/metainterp/optimizeopt/schedule.py
--- a/rpython/jit/metainterp/optimizeopt/schedule.py
+++ b/rpython/jit/metainterp/optimizeopt/schedule.py
@@ -416,6 +416,7 @@
if vbox.gettype() == INT:
return self.extend_int(vbox, newtype)
else:
+ import pdb; pdb.set_trace()
raise NotImplementedError("cannot yet extend float")
def extend_int(self, vbox, newtype):
@@ -856,8 +857,9 @@
class Accum(object):
PLUS = '+'
+ MULTIPLY = '*'
- def __init__(self, var=None, pos=-1, operator=PLUS):
+ def __init__(self, var, pos, operator):
self.var = var
self.pos = pos
self.operator = operator
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
@@ -1369,6 +1369,22 @@
i32 = int_ge(i30, i25)
guard_false(i32, descr=<Guard0x7f9f03ab17d0>) [p0, i29, i30, i31, p19,
None, None, None]
jump(p0, p19, i30, i31, i29, i8, i25,
descr=TargetToken(140320937897104))
+
+ """
+ trace ="""
+ [i0, i1, i16, i17, i18, i5, p6, p7, p8, f19, p10, p11, p12, p13, p14,
p15, i20, i21]
+
guard_early_exit(descr=<rpython.jit.metainterp.compile.ResumeAtLoopHeaderDescr
object at 0x7f249eb2e510>) [i5, i18, i17, i16, i1, i0, p15, p14, p13, p12, p11,
p10, p8, p7, p6, f19]
+ f22 = raw_load(i20, i18, descr=floatarraydescr)
+
guard_not_invalidated(descr=<rpython.jit.metainterp.compile.ResumeGuardNotInvalidated
object at 0x7f249eb2ec90>) [i5, i18, i17, i16, i1, i0, p15, p14, p13, p12,
p11, p10, p8, p7, p6, f22, f19]
+ f23 = raw_load(i21, i17, descr=floatarraydescr)
+ f24 = float_mul(f22, f23)
+ f25 = float_add(f19, f24)
+ i26 = int_add(i18, 8)
+ i27 = int_add(i17, 8)
+ i28 = int_lt(i16, i5)
+ guard_true(i28,
descr=<rpython.jit.metainterp.compile.ResumeGuardTrueDescr object at
0x7f249eb99290>) [i5, i26, i27, i16, i1, i0, p15, p14, p13, p12, p11, p10, p8,
p7, p6, f25, None]
+ i31 = int_add(i16, 1)
+ jump(i0, i1, i31, i27, i26, i5, p6, p7, p8, f25, p10, p11, p12, p13,
p14, p15, i20, i21)
"""
# schedule 885 -> ptype is non for raw_load?
opt = self.vectorize(self.parse_loop(trace))
diff --git a/rpython/jit/metainterp/optimizeopt/util.py
b/rpython/jit/metainterp/optimizeopt/util.py
--- a/rpython/jit/metainterp/optimizeopt/util.py
+++ b/rpython/jit/metainterp/optimizeopt/util.py
@@ -8,7 +8,7 @@
from rpython.rlib.objectmodel import we_are_translated
from rpython.jit.metainterp import resoperation
from rpython.jit.metainterp.resoperation import rop
-from rpython.jit.metainterp.resume import Snapshot
+from rpython.jit.metainterp.resume import Snapshot, AccumInfo
# ____________________________________________________________
# Misc. utilities
@@ -213,6 +213,8 @@
return True
def rename_failargs(self, guard, clone=False):
+ from rpython.jit.metainterp.history import BoxVectorAccum
+ from rpython.jit.metainterp.compile import ResumeGuardDescr
if guard.getfailargs() is not None:
if clone:
args = guard.getfailargs()[:]
@@ -220,6 +222,11 @@
args = guard.getfailargs()
for i,arg in enumerate(args):
value = self.rename_map.get(arg,arg)
+ if value is not arg and isinstance(value, BoxVectorAccum):
+ descr = guard.getdescr()
+ assert isinstance(descr,ResumeGuardDescr)
+ ai = AccumInfo(descr.rd_accum_list, i, value.operator)
+ descr.rd_accum_list = ai
args[i] = value
return args
return None
diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py
b/rpython/jit/metainterp/optimizeopt/vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/vectorize.py
@@ -767,7 +767,7 @@
lop = lnode.getoperation()
opnum = lop.getopnum()
- if opnum in (rop.FLOAT_ADD, rop.INT_ADD):
+ if opnum in (rop.FLOAT_ADD, rop.INT_ADD, rop.FLOAT_MUL):
roper = rnode.getoperation()
assert lop.numargs() == 2 and lop.result is not None
accum_var, accum_pos = self.getaccumulator_variable(lop, roper,
origin_pack)
@@ -802,7 +802,10 @@
# of leading/preceding signext/floatcast instructions needs to
be
# considered. => tree pattern matching problem.
return None
- accum = Accum(accum_var, accum_pos, Accum.PLUS)
+ operator = Accum.PLUS
+ if opnum == rop.FLOAT_ADD:
+ operator = Accum.MULTIPLY
+ accum = Accum(accum_var, accum_pos, operator)
return AccumPair(lnode, rnode, ptype, ptype, accum)
return None
@@ -824,14 +827,22 @@
# create a new vector box for the parameters
box = pack.input_type.new_vector_box()
size = vec_reg_size // pack.input_type.getsize()
- op = ResOperation(rop.VEC_BOX, [ConstInt(size)], box)
- sched_data.invariant_oplist.append(op)
- result = box.clonebox()
- # clear the box to zero TODO might not be zero for every reduction?
- op = ResOperation(rop.VEC_INT_XOR, [box, box], result)
- sched_data.invariant_oplist.append(op)
- box = result
- result = BoxVectorAccum(box, accum.var, '+')
+ # reset the box to zeros or ones
+ if accum.operator == Accum.PLUS:
+ op = ResOperation(rop.VEC_BOX, [ConstInt(size)], box)
+ sched_data.invariant_oplist.append(op)
+ result = box.clonebox()
+ op = ResOperation(rop.VEC_INT_XOR, [box, box], result)
+ sched_data.invariant_oplist.append(op)
+ box = result
+ elif accum.operator == Accum.MULTIPLY:
+ # multiply is only supported by floats
+ op = ResOperation(rop.VEC_FLOAT_EXPAND, [ConstInt(1)], box)
+ sched_data.invariant_oplist.append(op)
+ else:
+ import pdb; pdb.set_trace()
+ raise NotImplementedError
+ result = BoxVectorAccum(box, accum.var, accum.operator)
# pack the scalar value
op = ResOperation(getpackopnum(box.gettype()),
[box, accum.var, ConstInt(0), ConstInt(1)],
result)
diff --git a/rpython/jit/metainterp/resume.py b/rpython/jit/metainterp/resume.py
--- a/rpython/jit/metainterp/resume.py
+++ b/rpython/jit/metainterp/resume.py
@@ -34,6 +34,13 @@
self.jitcode = jitcode
self.pc = pc
+class AccumInfo(object):
+ __slots__ = ('prev', 'position', 'operation')
+ def __init__(self, prev, position, operation):
+ self.prev = prev
+ self.operation = operation
+ self.position = position
+
def _ensure_parent_resumedata(framestack, n):
target = framestack[n]
if n == 0:
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit