Author: Richard Plangger <[email protected]>
Branch: vecopt-merge
Changeset: r79819:a7fd3e5a487c
Date: 2015-09-25 12:13 +0200
http://bitbucket.org/pypy/pypy/changeset/a7fd3e5a487c/
Log:	added more types to the bool reduction tests; assembler going
	forward, but results are still not correct
diff --git a/rpython/jit/backend/llgraph/runner.py
b/rpython/jit/backend/llgraph/runner.py
--- a/rpython/jit/backend/llgraph/runner.py
+++ b/rpython/jit/backend/llgraph/runner.py
@@ -829,6 +829,12 @@
bh_vec_int_eq = bh_vec_float_eq
bh_vec_int_ne = bh_vec_float_ne
+ def bh_vec_int_is_true(self, vx, count):
+ return map(lambda x: bool(x), vx)
+
+ def bh_vec_int_is_false(self, vx, count):
+ return map(lambda x: not bool(x), vx)
+
def bh_vec_int_xor(self, vx, vy, count):
return [int(x) ^ int(y) for x,y in zip(vx,vy)]
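(Aside, illustrative only and not part of the commit: a minimal plain-Python sketch of the per-lane semantics the two new blackhole helpers above implement.)

    # One boolean per lane, mirroring bh_vec_int_is_true / bh_vec_int_is_false
    # above (the `count` argument of the real helpers is ignored here).
    def vec_int_is_true(vx):
        return [bool(x) for x in vx]      # True where the lane is non-zero

    def vec_int_is_false(vx):
        return [not bool(x) for x in vx]  # True where the lane is zero

    assert vec_int_is_true([0, 3, 0, 7])  == [False, True, False, True]
    assert vec_int_is_false([0, 3, 0, 7]) == [True, False, True, False]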
@@ -1075,22 +1081,22 @@
def execute_guard_early_exit(self, descr):
pass
- def _check_true(self, arg):
+ def _test_true(self, arg):
if isinstance(arg, list):
return all(arg)
return arg
- def _check_false(self, arg):
+ def _test_false(self, arg):
if isinstance(arg, list):
return any(arg)
return arg
def execute_guard_true(self, descr, arg):
- if not self._check_true(arg):
+ if not self._test_true(arg):
self.fail_guard(descr)
def execute_guard_false(self, descr, arg):
- if self._check_false(arg):
+ if self._test_false(arg):
self.fail_guard(descr)
def execute_guard_value(self, descr, arg1, arg2):
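(Aside, illustrative only: once a vector, i.e. a Python list, reaches these guards, guard_true holds only if every lane is true and guard_false holds only if no lane is true, matching the _test_true/_test_false helpers above.)

    def guard_true_holds(arg):
        return all(arg) if isinstance(arg, list) else bool(arg)

    def guard_false_holds(arg):
        return not any(arg) if isinstance(arg, list) else not bool(arg)

    assert guard_true_holds([1, 1, 1, 1])
    assert not guard_true_holds([1, 0, 1, 1])   # one false lane fails the guard
    assert guard_false_holds([0, 0, 0, 0])
    assert not guard_false_holds([0, 1, 0, 0])  # one true lane fails the guard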
diff --git a/rpython/jit/backend/x86/assembler.py
b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -584,7 +584,6 @@
name = "Bridge # %s" % (descr_number,)
self.cpu.profile_agent.native_code_written(name,
rawstart, fullsize)
- print "bridge pos", hex(startpos+rawstart),
hex(rawstart+bridgestartpos), startpos
return AsmInfo(ops_offset, startpos + rawstart, codeendpos - startpos,
rawstart+bridgestartpos)
def stitch_bridge(self, faildescr, target):
@@ -639,7 +638,6 @@
rawstart = self.materialize_loop(looptoken)
# update the jump to the real trace
self._patch_jump_for_descr(rawstart + offset, asminfo.rawstart)
- print faildescr, "=>", hex(asminfo.rawstart)
# update the guard to jump right to this custom piece of assembler
self.patch_jump_for_descr(faildescr, rawstart)
@@ -1069,6 +1067,8 @@
if result_loc is ebp:
self.guard_success_cc = cond
else:
+ if result_loc.is_xmm:
+ return
rl = result_loc.lowest8bits()
self.mc.SET_ir(cond, rl.value)
self.mc.MOVZX8_rr(result_loc.value, rl.value)
@@ -1663,14 +1663,6 @@
def genop_guard_guard_false(self, guard_op, guard_token, locs, resloc):
self.guard_success_cc = rx86.invert_condition(self.guard_success_cc)
- # TODO loc = locs[0]
- #if isinstance(loc, RegLoc):
- # if loc.is_xmm:
- # self._guard_vector_false(guard_op, loc)
- # # XXX
- # self.implement_guard(guard_token, 'NZ')
- # return
- #self.mc.TEST(loc, loc)
self.implement_guard(guard_token)
genop_guard_guard_isnull = genop_guard_guard_false
diff --git a/rpython/jit/backend/x86/regalloc.py
b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -400,7 +400,8 @@
def load_condition_into_cc(self, box):
if self.assembler.guard_success_cc == rx86.cond_none:
- self.assembler.test_location(self.loc(box))
+ if not box.is_vector():
+ self.assembler.test_location(self.loc(box))
self.assembler.guard_success_cc = rx86.Conditions['NZ']
def _consider_guard_cc(self, op):
diff --git a/rpython/jit/backend/x86/vector_ext.py
b/rpython/jit/backend/x86/vector_ext.py
--- a/rpython/jit/backend/x86/vector_ext.py
+++ b/rpython/jit/backend/x86/vector_ext.py
@@ -14,6 +14,7 @@
from rpython.rlib.objectmodel import we_are_translated
from rpython.rtyper.lltypesystem.lloperation import llop
from rpython.rtyper.lltypesystem import lltype
+from rpython.jit.backend.x86 import rx86
# duplicated for easy migration, def in assembler.py as well
# DUP START
@@ -33,6 +34,35 @@
class VectorAssemblerMixin(object):
_mixin_ = True
+ def guard_vector(self, guard_op, loc, true):
+ arg = guard_op.getarg(0)
+ size = arg.bytesize
+ temp = X86_64_XMM_SCRATCH_REG
+ load = arg.bytesize * arg.count - self.cpu.vector_register_size
+ assert load <= 0
+ if true:
+ self.mc.PXOR(temp, temp)
+ # if the vector is not fully packed blend 1s
+ if load < 0:
+ self.mc.PCMPEQQ(temp, temp) # fill with ones
+ self._blend_unused_slots(loc, arg, temp)
+ # reset to zeros
+ self.mc.PXOR(temp, temp)
+
+ # cmp with zeros (in temp) creates ones at each slot where it is
zero
+ self.mc.PCMPEQ(loc, temp, size)
+ # temp converted to ones
+ self.mc.PCMPEQQ(temp, temp)
+ # test if all slots are zero
+ self.mc.PTEST(loc, temp)
+ else:
+ # if the vector is not fully packed blend 1s
+ if load < 0:
+ temp = X86_64_XMM_SCRATCH_REG
+ self.mc.PXOR(temp, temp)
+ self._blend_unused_slots(loc, arg, temp)
+ self.mc.PTEST(loc, loc)
+
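(Aside, illustrative only, not the emitted SSE code: a rough lane-level model of what the PCMPEQ/PTEST sequences in guard_vector compute; the "true" path checks that no lane is zero, the "false" path that every lane is zero. The PBLENDW handling of partially packed vectors is left out.)

    def guard_vector_holds(lanes, true):
        if true:
            # PCMPEQ against zero marks the zero lanes; PTEST against an
            # all-ones mask then asks whether any such lane exists.
            return not any(x == 0 for x in lanes)
        else:
            # PTEST of the register with itself: zero iff every lane is zero.
            return all(x == 0 for x in lanes)

    assert guard_vector_holds([1, 2, 3, 4], true=True)
    assert not guard_vector_holds([1, 0, 3, 4], true=True)
    assert guard_vector_holds([0, 0, 0, 0], true=False)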
def _blend_unused_slots(self, loc, arg, temp):
select = 0
bits_used = (arg.item_count * arg.item_size * 8)
@@ -42,38 +72,6 @@
index += 1
self.mc.PBLENDW_xxi(loc.value, temp.value, select)
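(The body of _blend_unused_slots is mostly elided by this hunk; the sketch below is only an assumption about how the PBLENDW immediate could be derived, selecting from `temp` every 16-bit word slot above the bits actually used by the packed vector.)

    # Hypothetical sketch (the loop between the lines above is elided here).
    def blend_select_mask(item_count, item_size):
        bits_used = item_count * item_size * 8
        select = 0
        index = bits_used // 16   # first unused 16-bit word slot
        while index < 8:          # an XMM register holds 8 such slots
            select |= (1 << index)
            index += 1
        return select

    assert blend_select_mask(2, 8) == 0      # fully packed: blend nothing
    assert blend_select_mask(2, 4) == 0xf0   # upper 4 word slots come from temp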
- def _guard_vector_true(self, guard_op, loc, zero=False):
- return
- arg = guard_op.getarg(0)
- size = arg.bytesize
- temp = X86_64_XMM_SCRATCH_REG
- #
- self.mc.PXOR(temp, temp)
- # if the vector is not fully packed blend 1s
- if not arg.fully_packed(self.cpu.vector_register_size):
- self.mc.PCMPEQQ(temp, temp) # fill with ones
- self._blend_unused_slots(loc, arg, temp)
- # reset to zeros
- self.mc.PXOR(temp, temp)
-
- # cmp with zeros (in temp) creates ones at each slot where it is zero
- self.mc.PCMPEQ(loc, temp, size)
- # temp converted to ones
- self.mc.PCMPEQQ(temp, temp)
- # test if all slots are zero
- self.mc.PTEST(loc, temp)
-
- def _guard_vector_false(self, guard_op, loc):
- arg = guard_op.getarg(0)
- #
- # if the vector is not fully packed blend 1s
- if not arg.fully_packed(self.cpu.vector_register_size):
- temp = X86_64_XMM_SCRATCH_REG
- self.mc.PXOR(temp, temp)
- self._blend_unused_slots(loc, arg, temp)
- #
- self.mc.PTEST(loc, loc)
-
def _accum_update_at_exit(self, fail_locs, fail_args, faildescr, regalloc):
""" If accumulation is done in this loop, at the guard exit
some vector registers must be adjusted to yield the correct value
@@ -203,26 +201,16 @@
# a second time -> every zero entry (corresponding to non zero
# entries before) become ones
self.mc.PCMPEQ(loc, temp, sizeloc.value)
- # TODO
- #self.flush_cc(rx86.Conditions['NZ'], resloc)
- #loc = locs[0]
- #if isinstance(loc, RegLoc):
- # if loc.is_xmm:
- # self._guard_vector_true(guard_op, loc)
- # # XXX
- # self.implement_guard(guard_token, 'NZ')
- # return
- #self.mc.TEST(loc, loc)
+ self.flush_cc(rx86.Conditions['NZ'], resloc)
-
- def genop_guard_vec_int_is_true(self, op, guard_op, guard_token, arglocs, resloc):
- guard_opnum = guard_op.getopnum()
- if guard_opnum == rop.GUARD_TRUE:
- self._guard_vector_true(op, arglocs[0])
- self.implement_guard(guard_token, 'NZ')
- else:
- self._guard_vector_false(op, arglocs[0])
- self.implement_guard(guard_token, 'NZ')
+ #def genop_guard_vec_int_is_true(self, op, guard_op, guard_token, arglocs, resloc):
+ # guard_opnum = guard_op.getopnum()
+ # if guard_opnum == rop.GUARD_TRUE:
+ # self._guard_vector_true(op, arglocs[0])
+ # self.implement_guard(guard_token, 'NZ')
+ # else:
+ # self._guard_vector_false(op, arglocs[0])
+ # self.implement_guard(guard_token, 'NZ')
def genop_vec_int_mul(self, op, arglocs, resloc):
loc0, loc1, itemsize_loc = arglocs
@@ -233,9 +221,7 @@
self.mc.PMULLD(loc0, loc1)
else:
# NOTE see http://stackoverflow.com/questions/8866973/can-long-integer-routines-benefit-from-sse/8867025#8867025
- # There is no 64x64 bit packed mul and I did not find one
- # for 8 bit either. It is questionable if it gives any benefit
- # for 8 bit.
+ # There is no 64x64 bit packed mul, and none for 8 bit either; it is questionable whether it would give any benefit for 8 bit.
not_implemented("int8/64 mul")
def genop_vec_int_add(self, op, arglocs, resloc):
@@ -311,6 +297,7 @@
self.mc.XORPS(src, heap(self.single_float_const_neg_addr))
elif size == 8:
self.mc.XORPD(src, heap(self.float_const_neg_addr))
+ self.flush_cc(rx86.Conditions['NZ'], resloc)
def genop_vec_float_eq(self, op, arglocs, resloc):
_, rhsloc, sizeloc = arglocs
@@ -319,6 +306,7 @@
self.mc.CMPPS_xxi(resloc.value, rhsloc.value, 0) # 0 means equal
else:
self.mc.CMPPD_xxi(resloc.value, rhsloc.value, 0)
+ self.flush_cc(rx86.Conditions['NZ'], resloc)
def genop_vec_float_ne(self, op, arglocs, resloc):
_, rhsloc, sizeloc = arglocs
@@ -328,11 +316,13 @@
self.mc.CMPPS_xxi(resloc.value, rhsloc.value, 1 << 2)
else:
self.mc.CMPPD_xxi(resloc.value, rhsloc.value, 1 << 2)
+ self.flush_cc(rx86.Conditions['NZ'], resloc)
def genop_vec_int_eq(self, op, arglocs, resloc):
_, rhsloc, sizeloc = arglocs
size = sizeloc.value
self.mc.PCMPEQ(resloc, rhsloc, size)
+ self.flush_cc(rx86.Conditions['NZ'], resloc)
def genop_vec_int_ne(self, op, arglocs, resloc):
_, rhsloc, sizeloc = arglocs
@@ -346,6 +336,7 @@
# 11 11 11 11
# ----------- pxor
# 00 11 00 00
+ self.flush_cc(rx86.Conditions['NZ'], resloc)
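(The bit diagram above describes the usual SSE idiom for packed "not equal": compare for equality, then xor the resulting mask with all ones. A small lane-level illustration, not the actual backend code:)

    def vec_int_ne_mask(lhs, rhs, lane_ones=0xff):
        eq = [lane_ones if a == b else 0 for a, b in zip(lhs, rhs)]  # pcmpeq
        return [m ^ lane_ones for m in eq]                           # pxor

    assert vec_int_ne_mask([1, 2, 3, 4], [1, 9, 3, 0]) == [0, 0xff, 0, 0xff]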
def gen_cmp(func):
""" The requirement for func is that it must return one bits for each
@@ -362,10 +353,9 @@
self.mc.PCMPEQ(lhsloc, temp, size) # compare
self.mc.PCMPEQQ(temp, temp) # set all bits to 1
self.mc.PTEST(lhsloc, temp)
- self.implement_guard(guard_token, 'NZ')
else:
self.mc.PTEST(lhsloc, lhsloc)
- self.implement_guard(guard_token, 'NZ')
+ self.flush_cc(rx86.Conditions['NZ'], lhsloc)
return generate_assembler
genop_guard_vec_float_eq = gen_cmp(genop_vec_float_eq)
@@ -643,15 +633,13 @@
result = self.xrm.force_result_in_reg(op, op.getarg(0), args)
self.perform(op, [source, imm(lhs.bytesize)], result)
- def consider_vec_float_eq(self, op, guard_op):
+ def consider_vec_float_eq(self, op):
lhs = op.getarg(0)
args = op.getarglist()
lhsloc = self.xrm.force_result_in_reg(op, op.getarg(0), args)
rhsloc = self.make_sure_var_in_reg(op.getarg(1), args)
- if guard_op:
- self.perform_with_guard(op, guard_op, [lhsloc, rhsloc, imm(lhs.bytesize)], None)
- else:
- self.perform(op, [lhsloc, rhsloc, imm(lhs.bytesize)], lhsloc)
+ resloc = self.force_allocate_reg_or_cc(op)
+ self.perform(op, [lhsloc, rhsloc, imm(lhs.bytesize)], lhsloc)
consider_vec_float_ne = consider_vec_float_eq
consider_vec_int_eq = consider_vec_float_eq
diff --git a/rpython/jit/metainterp/resoperation.py
b/rpython/jit/metainterp/resoperation.py
--- a/rpython/jit/metainterp/resoperation.py
+++ b/rpython/jit/metainterp/resoperation.py
@@ -195,7 +195,6 @@
_attrs_ = ('_forwarded',)
_forwarded = None # either another resop or OptInfo
-
def get_forwarded(self):
return self._forwarded
@@ -1031,8 +1030,8 @@
'VEC_FLOAT_NEG/1/f',
'VEC_FLOAT_ABS/1/f',
'_VEC_ARITHMETIC_LAST',
- 'VEC_FLOAT_EQ/2b/f',
- 'VEC_FLOAT_NE/2b/f',
+ 'VEC_FLOAT_EQ/2b/i',
+ 'VEC_FLOAT_NE/2b/i',
'VEC_INT_IS_TRUE/1b/i',
'VEC_INT_NE/2b/i',
'VEC_INT_EQ/2b/i',
diff --git a/rpython/jit/metainterp/test/test_vector.py
b/rpython/jit/metainterp/test/test_vector.py
--- a/rpython/jit/metainterp/test/test_vector.py
+++ b/rpython/jit/metainterp/test/test_vector.py
@@ -157,20 +157,31 @@
res = self.meta_interp(f, [30])
assert res == f(30) == 128
- @py.test.mark.parametrize('func,init,insert,at,count,breaks',
+ @py.test.mark.parametrize('type,func,init,insert,at,count,breaks',
# all
- [(lambda x: not bool(x), 1.0, None, -1,32, False),
- (lambda x: x == 0.0, 1.0, None, -1,33, False),
- (lambda x: x == 0.0, 1.0, 0.0, 33,34, True),
+ [(rffi.DOUBLE, lambda x: not bool(x), 1.0, None, -1,32, False),
+ (rffi.DOUBLE, lambda x: x == 0.0, 1.0, None, -1,33, False),
+ (rffi.DOUBLE, lambda x: x == 0.0, 1.0, 0.0, 33,34, True),
+ (lltype.Signed, lambda x: not bool(x), 1, None, -1,32, False),
+ (lltype.Signed, lambda x: x == 0, 1, None, -1,33, False),
+ (lltype.Signed, lambda x: x == 0, 1, 0, 33,34, True),
# any
- (lambda x: x != 0.0, 0.0, 1.0, 33,35, True),
- (lambda x: x != 0.0, 0.0, 1.0, -1,36, False),
- (lambda x: bool(x), 0.0, 1.0, 33,37, True),
- (lambda x: bool(x), 0.0, 1.0, -1,38, False),
+ (rffi.DOUBLE, lambda x: x != 0.0, 0.0, 1.0, 33,35, True),
+ (rffi.DOUBLE, lambda x: x != 0.0, 0.0, 1.0, -1,36, False),
+ (rffi.DOUBLE, lambda x: bool(x), 0.0, 1.0, 33,37, True),
+ (rffi.DOUBLE, lambda x: bool(x), 0.0, 1.0, -1,38, False),
+ (lltype.Signed, lambda x: x != 0, 0, 1, 33,35, True),
+ (lltype.Signed, lambda x: x != 0, 0, 1, -1,36, False),
+ (lltype.Signed, lambda x: bool(x), 0, 1, 33,37, True),
+ (lltype.Signed, lambda x: bool(x), 0, 1, -1,38, False),
+ (rffi.INT, lambda x: intmask(x) != 0, rffi.r_int(0), rffi.r_int(1), 33,35, True),
+ (rffi.INT, lambda x: intmask(x) != 0, rffi.r_int(0), rffi.r_int(1), -1,36, False),
+ (rffi.INT, lambda x: bool(intmask(x)), rffi.r_int(0), rffi.r_int(1), 33,37, True),
+ (rffi.INT, lambda x: bool(intmask(x)), rffi.r_int(0), rffi.r_int(1), -1,38, False),
])
- def test_bool_reduction(self, func, init, insert, at, count, breaks):
+ def test_bool_reduction(self, type, func, init, insert, at, count, breaks):
myjitdriver = JitDriver(greens = [], reds = 'auto', vectorize=True)
- T = lltype.Array(rffi.DOUBLE, hints={'nolength': True})
+ T = lltype.Array(type, hints={'nolength': True})
def f(d):
va = lltype.malloc(T, d, flavor='raw', zero=True)
for i in range(d): va[i] = init