Author: Richard Plangger <planri...@gmail.com>
Branch: vecopt-merge
Changeset: r79819:a7fd3e5a487c
Date: 2015-09-25 12:13 +0200
http://bitbucket.org/pypy/pypy/changeset/a7fd3e5a487c/

Log:    added more types to the bool reduction tests; the assembler is
        making progress, but results are still not correct

diff --git a/rpython/jit/backend/llgraph/runner.py 
b/rpython/jit/backend/llgraph/runner.py
--- a/rpython/jit/backend/llgraph/runner.py
+++ b/rpython/jit/backend/llgraph/runner.py
@@ -829,6 +829,12 @@
     bh_vec_int_eq = bh_vec_float_eq
     bh_vec_int_ne = bh_vec_float_ne
 
+    def bh_vec_int_is_true(self, vx, count):
+        return map(lambda x: bool(x), vx)
+
+    def bh_vec_int_is_false(self, vx, count):
+        return map(lambda x: not bool(x), vx)
+
     def bh_vec_int_xor(self, vx, vy, count):
         return [int(x) ^ int(y) for x,y in zip(vx,vy)]
 
@@ -1075,22 +1081,22 @@
     def execute_guard_early_exit(self, descr):
         pass
 
-    def _check_true(self, arg):
+    def _test_true(self, arg):
         if isinstance(arg, list):
             return all(arg)
         return arg
 
-    def _check_false(self, arg):
+    def _test_false(self, arg):
         if isinstance(arg, list):
             return any(arg)
         return arg
 
     def execute_guard_true(self, descr, arg):
-        if not self._check_true(arg):
+        if not self._test_true(arg):
             self.fail_guard(descr)
 
     def execute_guard_false(self, descr, arg):
-        if self._check_false(arg):
+        if self._test_false(arg):
             self.fail_guard(descr)
 
     def execute_guard_value(self, descr, arg1, arg2):
diff --git a/rpython/jit/backend/x86/assembler.py 
b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -584,7 +584,6 @@
             name = "Bridge # %s" % (descr_number,)
             self.cpu.profile_agent.native_code_written(name,
                                                        rawstart, fullsize)
-        print "bridge pos", hex(startpos+rawstart), 
hex(rawstart+bridgestartpos), startpos
         return AsmInfo(ops_offset, startpos + rawstart, codeendpos - startpos, 
rawstart+bridgestartpos)
 
     def stitch_bridge(self, faildescr, target):
@@ -639,7 +638,6 @@
         rawstart = self.materialize_loop(looptoken)
         # update the jump to the real trace
         self._patch_jump_for_descr(rawstart + offset, asminfo.rawstart)
-        print faildescr, "=>", hex(asminfo.rawstart)
         # update the guard to jump right to this custom piece of assembler
         self.patch_jump_for_descr(faildescr, rawstart)
 
@@ -1069,6 +1067,8 @@
         if result_loc is ebp:
             self.guard_success_cc = cond
         else:
+            if result_loc.is_xmm:
+                return
             rl = result_loc.lowest8bits()
             self.mc.SET_ir(cond, rl.value)
             self.mc.MOVZX8_rr(result_loc.value, rl.value)
@@ -1663,14 +1663,6 @@
 
     def genop_guard_guard_false(self, guard_op, guard_token, locs, resloc):
         self.guard_success_cc = rx86.invert_condition(self.guard_success_cc)
-        # TODO loc = locs[0]
-        #if isinstance(loc, RegLoc):
-        #    if loc.is_xmm:
-        #        self._guard_vector_false(guard_op, loc)
-        #        # XXX
-        #        self.implement_guard(guard_token, 'NZ')
-        #        return
-        #self.mc.TEST(loc, loc)
         self.implement_guard(guard_token)
     genop_guard_guard_isnull = genop_guard_guard_false
 
diff --git a/rpython/jit/backend/x86/regalloc.py 
b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -400,7 +400,8 @@
 
     def load_condition_into_cc(self, box):
         if self.assembler.guard_success_cc == rx86.cond_none:
-            self.assembler.test_location(self.loc(box))
+            if not box.is_vector():
+                self.assembler.test_location(self.loc(box))
             self.assembler.guard_success_cc = rx86.Conditions['NZ']
 
     def _consider_guard_cc(self, op):
diff --git a/rpython/jit/backend/x86/vector_ext.py 
b/rpython/jit/backend/x86/vector_ext.py
--- a/rpython/jit/backend/x86/vector_ext.py
+++ b/rpython/jit/backend/x86/vector_ext.py
@@ -14,6 +14,7 @@
 from rpython.rlib.objectmodel import we_are_translated
 from rpython.rtyper.lltypesystem.lloperation import llop
 from rpython.rtyper.lltypesystem import lltype
+from rpython.jit.backend.x86 import rx86
 
 # duplicated for easy migration, def in assembler.py as well
 # DUP START
@@ -33,6 +34,35 @@
 class VectorAssemblerMixin(object):
     _mixin_ = True
 
+    def guard_vector(self, guard_op, loc, true):
+        arg = guard_op.getarg(0)
+        size = arg.bytesize
+        temp = X86_64_XMM_SCRATCH_REG
+        load = arg.bytesize * arg.count - self.cpu.vector_register_size
+        assert load <= 0
+        if true:
+            self.mc.PXOR(temp, temp)
+            # if the vector is not fully packed blend 1s
+            if load < 0:
+                self.mc.PCMPEQQ(temp, temp) # fill with ones
+                self._blend_unused_slots(loc, arg, temp)
+                # reset to zeros
+                self.mc.PXOR(temp, temp)
+
+            # cmp with zeros (in temp) creates ones at each slot where it is 
zero
+            self.mc.PCMPEQ(loc, temp, size)
+            # temp converted to ones
+            self.mc.PCMPEQQ(temp, temp)
+            # test if all slots are zero
+            self.mc.PTEST(loc, temp)
+        else:
+            # if the vector is not fully packed blend 1s
+            if load < 0:
+                temp = X86_64_XMM_SCRATCH_REG
+                self.mc.PXOR(temp, temp)
+                self._blend_unused_slots(loc, arg, temp)
+            self.mc.PTEST(loc, loc)
+
     def _blend_unused_slots(self, loc, arg, temp):
         select = 0
         bits_used = (arg.item_count * arg.item_size * 8)
@@ -42,38 +72,6 @@
             index += 1
         self.mc.PBLENDW_xxi(loc.value, temp.value, select)
 
-    def _guard_vector_true(self, guard_op, loc, zero=False):
-        return
-        arg = guard_op.getarg(0)
-        size = arg.bytesize
-        temp = X86_64_XMM_SCRATCH_REG
-        #
-        self.mc.PXOR(temp, temp)
-        # if the vector is not fully packed blend 1s
-        if not arg.fully_packed(self.cpu.vector_register_size):
-            self.mc.PCMPEQQ(temp, temp) # fill with ones
-            self._blend_unused_slots(loc, arg, temp)
-            # reset to zeros
-            self.mc.PXOR(temp, temp)
-
-        # cmp with zeros (in temp) creates ones at each slot where it is zero
-        self.mc.PCMPEQ(loc, temp, size)
-        # temp converted to ones
-        self.mc.PCMPEQQ(temp, temp)
-        # test if all slots are zero
-        self.mc.PTEST(loc, temp)
-
-    def _guard_vector_false(self, guard_op, loc):
-        arg = guard_op.getarg(0)
-        #
-        # if the vector is not fully packed blend 1s
-        if not arg.fully_packed(self.cpu.vector_register_size):
-            temp = X86_64_XMM_SCRATCH_REG
-            self.mc.PXOR(temp, temp)
-            self._blend_unused_slots(loc, arg, temp)
-        #
-        self.mc.PTEST(loc, loc)
-
     def _accum_update_at_exit(self, fail_locs, fail_args, faildescr, regalloc):
         """ If accumulation is done in this loop, at the guard exit
             some vector registers must be adjusted to yield the correct value
@@ -203,26 +201,16 @@
         # a second time -> every zero entry (corresponding to non zero
         # entries before) become ones
         self.mc.PCMPEQ(loc, temp, sizeloc.value)
-        # TODO
-        #self.flush_cc(rx86.Conditions['NZ'], resloc)
-        #loc = locs[0]
-        #if isinstance(loc, RegLoc):
-        #    if loc.is_xmm:
-        #        self._guard_vector_true(guard_op, loc)
-        #        # XXX
-        #        self.implement_guard(guard_token, 'NZ')
-        #        return
-        #self.mc.TEST(loc, loc)
+        self.flush_cc(rx86.Conditions['NZ'], resloc)
 
-
-    def genop_guard_vec_int_is_true(self, op, guard_op, guard_token, arglocs, 
resloc):
-        guard_opnum = guard_op.getopnum()
-        if guard_opnum == rop.GUARD_TRUE:
-            self._guard_vector_true(op, arglocs[0])
-            self.implement_guard(guard_token, 'NZ')
-        else:
-            self._guard_vector_false(op, arglocs[0])
-            self.implement_guard(guard_token, 'NZ')
+    #def genop_guard_vec_int_is_true(self, op, guard_op, guard_token, arglocs, 
resloc):
+    #    guard_opnum = guard_op.getopnum()
+    #    if guard_opnum == rop.GUARD_TRUE:
+    #        self._guard_vector_true(op, arglocs[0])
+    #        self.implement_guard(guard_token, 'NZ')
+    #    else:
+    #        self._guard_vector_false(op, arglocs[0])
+    #        self.implement_guard(guard_token, 'NZ')
 
     def genop_vec_int_mul(self, op, arglocs, resloc):
         loc0, loc1, itemsize_loc = arglocs
@@ -233,9 +221,7 @@
             self.mc.PMULLD(loc0, loc1)
         else:
             # NOTE see 
http://stackoverflow.com/questions/8866973/can-long-integer-routines-benefit-from-sse/8867025#8867025
-            # There is no 64x64 bit packed mul and I did not find one
-            # for 8 bit either. It is questionable if it gives any benefit
-            # for 8 bit.
+            # There is no 64x64 bit packed mul, and none for 8 bit either.
+            # It is questionable whether 8 bit would give any benefit anyway.
             not_implemented("int8/64 mul")
 
     def genop_vec_int_add(self, op, arglocs, resloc):
@@ -311,6 +297,7 @@
             self.mc.XORPS(src, heap(self.single_float_const_neg_addr))
         elif size == 8:
             self.mc.XORPD(src, heap(self.float_const_neg_addr))
+        self.flush_cc(rx86.Conditions['NZ'], resloc)
 
     def genop_vec_float_eq(self, op, arglocs, resloc):
         _, rhsloc, sizeloc = arglocs
@@ -319,6 +306,7 @@
             self.mc.CMPPS_xxi(resloc.value, rhsloc.value, 0) # 0 means equal
         else:
             self.mc.CMPPD_xxi(resloc.value, rhsloc.value, 0)
+        self.flush_cc(rx86.Conditions['NZ'], resloc)
 
     def genop_vec_float_ne(self, op, arglocs, resloc):
         _, rhsloc, sizeloc = arglocs
@@ -328,11 +316,13 @@
             self.mc.CMPPS_xxi(resloc.value, rhsloc.value, 1 << 2)
         else:
             self.mc.CMPPD_xxi(resloc.value, rhsloc.value, 1 << 2)
+        self.flush_cc(rx86.Conditions['NZ'], resloc)
 
     def genop_vec_int_eq(self, op, arglocs, resloc):
         _, rhsloc, sizeloc = arglocs
         size = sizeloc.value
         self.mc.PCMPEQ(resloc, rhsloc, size)
+        self.flush_cc(rx86.Conditions['NZ'], resloc)
 
     def genop_vec_int_ne(self, op, arglocs, resloc):
         _, rhsloc, sizeloc = arglocs
@@ -346,6 +336,7 @@
         # 11 11 11 11
         # ----------- pxor
         # 00 11 00 00
+        self.flush_cc(rx86.Conditions['NZ'], resloc)
 
     def gen_cmp(func):
         """ The requirement for func is that it must return one bits for each
@@ -362,10 +353,9 @@
                 self.mc.PCMPEQ(lhsloc, temp, size) # compare
                 self.mc.PCMPEQQ(temp, temp) # set all bits to 1
                 self.mc.PTEST(lhsloc, temp)
-                self.implement_guard(guard_token, 'NZ')
             else:
                 self.mc.PTEST(lhsloc, lhsloc)
-                self.implement_guard(guard_token, 'NZ')
+            self.flush_cc(rx86.Conditions['NZ'], lhsloc)
         return generate_assembler
 
     genop_guard_vec_float_eq = gen_cmp(genop_vec_float_eq)
@@ -643,15 +633,13 @@
         result = self.xrm.force_result_in_reg(op, op.getarg(0), args)
         self.perform(op, [source, imm(lhs.bytesize)], result)
 
-    def consider_vec_float_eq(self, op, guard_op):
+    def consider_vec_float_eq(self, op):
         lhs = op.getarg(0)
         args = op.getarglist()
         lhsloc = self.xrm.force_result_in_reg(op, op.getarg(0), args)
         rhsloc = self.make_sure_var_in_reg(op.getarg(1), args)
-        if guard_op:
-            self.perform_with_guard(op, guard_op, [lhsloc, rhsloc, 
imm(lhs.bytesize)], None)
-        else:
-            self.perform(op, [lhsloc, rhsloc, imm(lhs.bytesize)], lhsloc)
+        resloc = self.force_allocate_reg_or_cc(op)
+        self.perform(op, [lhsloc, rhsloc, imm(lhs.bytesize)], lhsloc)
 
     consider_vec_float_ne = consider_vec_float_eq
     consider_vec_int_eq = consider_vec_float_eq
diff --git a/rpython/jit/metainterp/resoperation.py 
b/rpython/jit/metainterp/resoperation.py
--- a/rpython/jit/metainterp/resoperation.py
+++ b/rpython/jit/metainterp/resoperation.py
@@ -195,7 +195,6 @@
     _attrs_ = ('_forwarded',)
     _forwarded = None # either another resop or OptInfo  
 
-
     def get_forwarded(self):
         return self._forwarded
 
@@ -1031,8 +1030,8 @@
     'VEC_FLOAT_NEG/1/f',
     'VEC_FLOAT_ABS/1/f',
     '_VEC_ARITHMETIC_LAST',
-    'VEC_FLOAT_EQ/2b/f',
-    'VEC_FLOAT_NE/2b/f',
+    'VEC_FLOAT_EQ/2b/i',
+    'VEC_FLOAT_NE/2b/i',
     'VEC_INT_IS_TRUE/1b/i',
     'VEC_INT_NE/2b/i',
     'VEC_INT_EQ/2b/i',
diff --git a/rpython/jit/metainterp/test/test_vector.py 
b/rpython/jit/metainterp/test/test_vector.py
--- a/rpython/jit/metainterp/test/test_vector.py
+++ b/rpython/jit/metainterp/test/test_vector.py
@@ -157,20 +157,31 @@
         res = self.meta_interp(f, [30])
         assert res == f(30) == 128
 
-    @py.test.mark.parametrize('func,init,insert,at,count,breaks',
+    @py.test.mark.parametrize('type,func,init,insert,at,count,breaks',
             # all
-           [(lambda x: not bool(x), 1.0, None, -1,32, False),
-            (lambda x: x == 0.0,    1.0, None, -1,33, False),
-            (lambda x: x == 0.0,    1.0, 0.0,  33,34, True),
+           [(rffi.DOUBLE, lambda x: not bool(x), 1.0, None, -1,32, False),
+            (rffi.DOUBLE, lambda x: x == 0.0,    1.0, None, -1,33, False),
+            (rffi.DOUBLE, lambda x: x == 0.0,    1.0, 0.0,  33,34, True),
+            (lltype.Signed, lambda x: not bool(x), 1, None, -1,32, False),
+            (lltype.Signed, lambda x: x == 0,      1, None, -1,33, False),
+            (lltype.Signed, lambda x: x == 0,      1, 0,  33,34, True),
             # any
-            (lambda x: x != 0.0,    0.0, 1.0,  33,35, True),
-            (lambda x: x != 0.0,    0.0, 1.0,  -1,36, False),
-            (lambda x: bool(x),     0.0, 1.0,  33,37, True),
-            (lambda x: bool(x),     0.0, 1.0,  -1,38, False),
+            (rffi.DOUBLE, lambda x: x != 0.0,    0.0, 1.0,  33,35, True),
+            (rffi.DOUBLE, lambda x: x != 0.0,    0.0, 1.0,  -1,36, False),
+            (rffi.DOUBLE, lambda x: bool(x),     0.0, 1.0,  33,37, True),
+            (rffi.DOUBLE, lambda x: bool(x),     0.0, 1.0,  -1,38, False),
+            (lltype.Signed, lambda x: x != 0,    0, 1,  33,35, True),
+            (lltype.Signed, lambda x: x != 0,    0, 1,  -1,36, False),
+            (lltype.Signed, lambda x: bool(x),   0, 1,  33,37, True),
+            (lltype.Signed, lambda x: bool(x),   0, 1,  -1,38, False),
+            (rffi.INT, lambda x: intmask(x) != 0,    rffi.r_int(0), 
rffi.r_int(1),  33,35, True),
+            (rffi.INT, lambda x: intmask(x) != 0,    rffi.r_int(0), 
rffi.r_int(1),  -1,36, False),
+            (rffi.INT, lambda x: bool(intmask(x)),   rffi.r_int(0), 
rffi.r_int(1),  33,37, True),
+            (rffi.INT, lambda x: bool(intmask(x)),   rffi.r_int(0), 
rffi.r_int(1),  -1,38, False),
            ])
-    def test_bool_reduction(self, func, init, insert, at, count, breaks):
+    def test_bool_reduction(self, type, func, init, insert, at, count, breaks):
         myjitdriver = JitDriver(greens = [], reds = 'auto', vectorize=True)
-        T = lltype.Array(rffi.DOUBLE, hints={'nolength': True})
+        T = lltype.Array(type, hints={'nolength': True})
         def f(d):
             va = lltype.malloc(T, d, flavor='raw', zero=True)
             for i in range(d): va[i] = init
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to