Author: Richard Plangger <r...@pasra.at>
Branch: vecopt
Changeset: r78378:88fff9cde657
Date: 2015-07-01 13:48 +0200
http://bitbucket.org/pypy/pypy/changeset/88fff9cde657/

Log:    resolving issues with * accumulation; there were some assembler
        routines I did not implement correctly (but they were not invoked
        beforehand)

diff --git a/rpython/jit/backend/x86/assembler.py 
b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -559,7 +559,6 @@
                                              self.current_clt.allgcrefs,
                                              self.current_clt.frame_info)
         self._check_frame_depth(self.mc, regalloc.get_gcmap())
-        #import pdb; pdb.set_trace()
         self._accum_update_at_exit(arglocs, inputargs, faildescr, regalloc)
         frame_depth_no_fixed_size = self._assemble(regalloc, inputargs, 
operations)
         codeendpos = self.mc.get_relative_pos()
@@ -2551,7 +2550,6 @@
             elif accum_info.operation == '*':
                 self._accum_reduce_mul(arg, loc, tgtloc)
             else:
-                import pdb; pdb.set_trace()
                 not_implemented("accum operator %s not implemented" %
                                             (accum_info.operation)) 
             fail_locs[pos] = tgtloc
@@ -2559,13 +2557,13 @@
             accum_info = accum_info.prev
 
     def _accum_reduce_mul(self, arg, accumloc, targetloc):
-        scratchloc = X86_64_SCRATCH_REG
-        self.mc.mov(scratchloc, accumloc)
+        scratchloc = X86_64_XMM_SCRATCH_REG
+        self.mov(accumloc, scratchloc)
         # swap the two elements
         self.mc.SHUFPS_xxi(scratchloc.value, scratchloc.value, 0x01)
         self.mc.MULPD(accumloc, scratchloc)
         if accumloc is not targetloc:
-            self.mc.mov(targetloc, accumloc)
+            self.mov(accumloc, targetloc)
 
     def _accum_reduce_sum(self, arg, accumloc, targetloc):
         # Currently the accumulator can ONLY be the biggest
@@ -2575,7 +2573,7 @@
             self.mc.HADDPD(accumloc, accumloc)
             # upper bits (> 64) are dirty (but does not matter)
             if accumloc is not targetloc:
-                self.mov(targetloc, accumloc)
+                self.mov(accumloc, targetloc)
             return
         elif arg.type == INT:
             scratchloc = X86_64_SCRATCH_REG
@@ -2757,7 +2755,9 @@
     def genop_vec_float_expand(self, op, arglocs, resloc):
         srcloc, sizeloc = arglocs
         size = sizeloc.value
-        if size == 4:
+        if isinstance(srcloc, ConstFloatLoc):
+            self.mov(srcloc, resloc)
+        elif size == 4:
             # the register allocator forces src to be the same as resloc
             # r = (s[0], s[0], r[0], r[0])
             # since resloc == srcloc: r = (r[0], r[0], r[0], r[0])
@@ -2864,7 +2864,7 @@
                         # if source is a normal register (unpack)
                         assert count == 1
                         assert si == 0
-                        self.mov(X86_64_XMM_SCRATCH_REG, srcloc)
+                        self.mov(srcloc, X86_64_XMM_SCRATCH_REG)
                         src = X86_64_XMM_SCRATCH_REG.value
                     select = ((si & 0x3) << 6)|((ri & 0x3) << 4)
                     self.mc.INSERTPS_xxi(resloc.value, src, select)
diff --git a/rpython/jit/backend/x86/regalloc.py 
b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -81,13 +81,11 @@
         rffi.cast(rffi.CArrayPtr(longlong.FLOATSTORAGE), adr)[1] = y
         return ConstFloatLoc(adr)
 
-    def expand_float(self, var, const):
-        assert isinstance(var, BoxVector)
-        if var.getsize() == 4:
+    def expand_float(self, size, const):
+        if size == 4:
             loc = self.expand_single_float(const)
         else:
             loc = self.expand_double_float(const)
-        self.reg_bindings[var] = loc
         return loc
 
     def expand_double_float(self, f):
@@ -1632,16 +1630,19 @@
     consider_vec_float_unpack = consider_vec_int_unpack
 
     def consider_vec_float_expand(self, op):
+        result = op.result
+        assert isinstance(result, BoxVector)
         arg = op.getarg(0)
+        args = op.getarglist()
         if isinstance(arg, Const):
-            resloc = self.xrm.expand_float(op.result, arg)
-            # TODO consider this
-            return
-        args = op.getarglist()
-        resloc = self.xrm.force_result_in_reg(op.result, arg, args)
-        assert isinstance(op.result, BoxVector)
+            resloc = self.xrm.force_allocate_reg(result)
+            srcloc = self.xrm.expand_float(result.getsize(), arg)
+        else:
+            resloc = self.xrm.force_result_in_reg(op.result, arg, args)
+            srcloc = resloc
+
         size = op.result.getsize()
-        self.perform(op, [resloc, imm(size)], resloc)
+        self.perform(op, [srcloc, imm(size)], resloc)
 
     def consider_vec_int_expand(self, op):
         arg = op.getarg(0)
diff --git a/rpython/jit/metainterp/compile.py 
b/rpython/jit/metainterp/compile.py
--- a/rpython/jit/metainterp/compile.py
+++ b/rpython/jit/metainterp/compile.py
@@ -488,8 +488,7 @@
 
 class ResumeGuardDescr(ResumeDescr):
     _attrs_ = ('rd_numb', 'rd_count', 'rd_consts', 'rd_virtuals',
-               'rd_frame_info_list', 'rd_pendingfields', 'rd_accum_list',
-               'status')
+               'rd_frame_info_list', 'rd_pendingfields', 'status')
     
     rd_numb = lltype.nullptr(NUMBERING)
     rd_count = 0
diff --git a/rpython/jit/metainterp/history.py 
b/rpython/jit/metainterp/history.py
--- a/rpython/jit/metainterp/history.py
+++ b/rpython/jit/metainterp/history.py
@@ -156,7 +156,7 @@
     index = -1
     final_descr = False
 
-    _attrs_ = ('adr_jump_offset', 'rd_locs', 'rd_loop_token')
+    _attrs_ = ('adr_jump_offset', 'rd_locs', 'rd_loop_token', 'rd_accum_list')
 
     def handle_fail(self, deadframe, metainterp_sd, jitdriver_sd):
         raise NotImplementedError
diff --git a/rpython/jit/metainterp/optimizeopt/schedule.py 
b/rpython/jit/metainterp/optimizeopt/schedule.py
--- a/rpython/jit/metainterp/optimizeopt/schedule.py
+++ b/rpython/jit/metainterp/optimizeopt/schedule.py
@@ -416,7 +416,6 @@
         if vbox.gettype() == INT:
             return self.extend_int(vbox, newtype)
         else:
-            import pdb; pdb.set_trace()
             raise NotImplementedError("cannot yet extend float")
 
     def extend_int(self, vbox, newtype):
diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py 
b/rpython/jit/metainterp/optimizeopt/vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/vectorize.py
@@ -803,7 +803,7 @@
                 # considered. => tree pattern matching problem.
                 return None
             operator = Accum.PLUS
-            if opnum == rop.FLOAT_ADD:
+            if opnum == rop.FLOAT_MUL:
                 operator = Accum.MULTIPLY
             accum = Accum(accum_var, accum_pos, operator)
             return AccumPair(lnode, rnode, ptype, ptype, accum)
@@ -837,11 +837,10 @@
                 box = result
             elif accum.operator == Accum.MULTIPLY:
                 # multiply is only supported by floats
-                op = ResOperation(rop.VEC_FLOAT_EXPAND, [ConstInt(1)], box)
+                op = ResOperation(rop.VEC_FLOAT_EXPAND, [ConstFloat(1.0)], box)
                 sched_data.invariant_oplist.append(op)
             else:
-                import pdb; pdb.set_trace()
-                raise NotImplementedError
+                raise NotImplementedError("can only handle + and *")
             result = BoxVectorAccum(box, accum.var, accum.operator)
             # pack the scalar value
             op = ResOperation(getpackopnum(box.gettype()),
diff --git a/rpython/jit/metainterp/resoperation.py 
b/rpython/jit/metainterp/resoperation.py
--- a/rpython/jit/metainterp/resoperation.py
+++ b/rpython/jit/metainterp/resoperation.py
@@ -512,8 +512,8 @@
     'VEC_FLOAT_PACK/4',          # VEC_FLOAT_PACK(vX, var/const, index, 
item_count)
     'VEC_INT_UNPACK/3',          # iX|fX = VEC_INT_UNPACK(vX, index, 
item_count)
     'VEC_INT_PACK/4',            # VEC_INT_PACK(vX, var/const, index, 
item_count)
-    'VEC_FLOAT_EXPAND/1',        # vX = VEC_FLOAT_EXPAND(var/const, item_count)
-    'VEC_INT_EXPAND/1',          # vX = VEC_INT_EXPAND(var/const, item_count)
+    'VEC_FLOAT_EXPAND/1',        # vX = VEC_FLOAT_EXPAND(var/const)
+    'VEC_INT_EXPAND/1',          # vX = VEC_INT_EXPAND(var/const)
     'VEC_BOX/1',
     '_VEC_PURE_LAST',
     #
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to