[pypy-commit] pypy zarch-simd-support: more vector ops, int->float, comparison float

plan_rich Wed, 14 Sep 2016 07:20:43 -0700

Author: Richard Plangger <planri...@gmail.com>
Branch: zarch-simd-support
Changeset: r87107:4c3e946b5c05
Date: 2016-09-14 16:19 +0200
http://bitbucket.org/pypy/pypy/changeset/4c3e946b5c05/


Log:    more vector ops, int->float, comparison float

diff --git a/rpython/jit/backend/zarch/assembler.py b/rpython/jit/backend/zarch/assembler.py
--- a/rpython/jit/backend/zarch/assembler.py
+++ b/rpython/jit/backend/zarch/assembler.py
@@ -919,6 +919,10 @@
         return frame_depth
 
     def regalloc_mov(self, prev_loc, loc):
+        if prev_loc.is_vector_reg():
+            assert loc.is_vector_reg()
+            self.mc.VLR(loc, prev_loc)
+            return
         if prev_loc.is_imm():
             value = prev_loc.getint()
             # move immediate value to register
diff --git a/rpython/jit/backend/zarch/conditions.py b/rpython/jit/backend/zarch/conditions.py
--- a/rpython/jit/backend/zarch/conditions.py
+++ b/rpython/jit/backend/zarch/conditions.py
@@ -18,7 +18,7 @@
 # normal branch instructions
 FLOAT = ConditionLocation(0x10)
 
-EQ = ConditionLocation(0x8)
+VEQI = EQ = ConditionLocation(0x8)
 LT = ConditionLocation(0x4)
 GT = ConditionLocation(0x2)
 OF = ConditionLocation(0x1) # overflow
@@ -27,7 +27,7 @@
 FLE = ConditionLocation(EQ.value | LT.value)
 GE = ConditionLocation(EQ.value | GT.value | OF.value)
 FGE = ConditionLocation(EQ.value | GT.value)
-NE = ConditionLocation(LT.value | GT.value | OF.value)
+VNEI = NE = ConditionLocation(LT.value | GT.value | OF.value)
 NO = ConditionLocation(0xe) # NO overflow
 
 FGT = ConditionLocation(GT.value | OF.value)
diff --git a/rpython/jit/backend/zarch/instruction_builder.py b/rpython/jit/backend/zarch/instruction_builder.py
--- a/rpython/jit/backend/zarch/instruction_builder.py
+++ b/rpython/jit/backend/zarch/instruction_builder.py
@@ -468,27 +468,27 @@
         byte = (v1 & BIT_MASK_4) << 4 | (idx & BIT_MASK_4)
         self.writechar(chr(byte))
         encode_base_displace(self, bid)
-        self.writechar(chr((mask & BIT_MASK_4 << 4) | (rbx & BIT_MASK_4)))
+        self.writechar(chr((mask & BIT_MASK_4) << 4 | (rbx & BIT_MASK_4)))
         self.writechar(opcode2)
     return encode_vrx
 
-def build_vrr_a(mnemonic, (opcode1,opcode2), argtypes='v,v'):
+def build_vrr_a(mnemonic, (opcode1,opcode2), argtypes='v,v,m,m,m'):
     @builder.arguments(argtypes)
-    def encode_vrr_a(self, v1, v2):
+    def encode_vrr_a(self, v1, v2, mask3=0, mask4=0, mask5=0):
         self.writechar(opcode1)
         rbx =  (v1 >= 16) << 3
         rbx |= (v2 >= 16) << 2
         byte = (v1 & BIT_MASK_4) << 4 | (v2 & BIT_MASK_4)
         self.writechar(chr(byte))
         self.writechar(chr(0))
-        self.writechar(chr(0))
-        self.writechar(chr(rbx & BIT_MASK_4))
+        self.writechar(chr((mask5 & BIT_MASK_4) << 4 | (mask4 & BIT_MASK_4)))
+        self.writechar(chr((mask3 & BIT_MASK_4) << 4 | (rbx & BIT_MASK_4)))
         self.writechar(opcode2)
     return encode_vrr_a
 
-def build_vrr_c(mnemonic, (opcode1,opcode2), argtypes='v,v,v,m,m'):
+def build_vrr_c(mnemonic, (opcode1,opcode2), argtypes='v,v,v,m,m,m'):
     @builder.arguments(argtypes)
-    def encode_vrr_c(self, v1, v2, v3, mask1=0, mask2=0):
+    def encode_vrr_c(self, v1, v2, v3, mask4=0, mask5=0, mask6=0):
         self.writechar(opcode1)
         rbx =  (v1 >= 16) << 3
         rbx |= (v2 >= 16) << 2
@@ -497,11 +497,40 @@
         self.writechar(chr(byte))
         byte = (v3 & BIT_MASK_4) << 4
         self.writechar(chr(byte))
-        self.writechar(chr(mask2 & BIT_MASK_4))
-        self.writechar(chr((mask1 & BIT_MASK_4) << 4 | (rbx & BIT_MASK_4)))
+        self.writechar(chr((mask6 & BIT_MASK_4) << 4 | (mask5 & BIT_MASK_4)))
+        self.writechar(chr((mask4 & BIT_MASK_4) << 4 | (rbx & BIT_MASK_4)))
         self.writechar(opcode2)
     return encode_vrr_c
 
+def build_vrr_e(mnemonic, (opcode1,opcode2), argtypes='v,v,v,v,m,m'):
+    @builder.arguments(argtypes)
+    def encode_vrr_e(self, v1, v2, v3, v4, mask5=0, mask6=0):
+        self.writechar(opcode1)
+        rbx =  (v1 >= 16) << 3
+        rbx |= (v2 >= 16) << 2
+        rbx |= (v3 >= 16) << 1
+        rbx |= (v4 >= 16)
+        byte = (v1 & BIT_MASK_4) << 4 | (v2 & BIT_MASK_4)
+        self.writechar(chr(byte))
+        byte = (v3 & BIT_MASK_4) << 4 | (mask6 & BIT_MASK_4) << 4
+        self.writechar(chr(byte))
+        self.writechar(chr((mask5 & BIT_MASK_4)))
+        self.writechar(chr((v4 & BIT_MASK_4) << 4 | (rbx & BIT_MASK_4)))
+        self.writechar(opcode2)
+    return encode_vrr_e
+
+def build_vri_a(mnemonic, (opcode1,opcode2), argtypes='v,i16,m'):
+    @builder.arguments(argtypes)
+    def encode_vri_a(self, v1, i2, mask3):
+        self.writechar(opcode1)
+        rbx =  (v1 >= 16) << 3
+        byte = (v1 & BIT_MASK_4) << 4
+        self.writechar(chr(byte))
+        self.write_i16(i2 & BIT_MASK_16)
+        self.writechar(chr((mask3 & BIT_MASK_4) << 4 | (rbx & BIT_MASK_4)))
+        self.writechar(opcode2)
+    return encode_vri_a
+
 
 def build_unpack_func(mnemonic, func):
     @always_inline
@@ -555,13 +584,12 @@
             return arg
     unpack_arg._annspecialcase_ = 'specialize:arg(1)'
     argtypes = func._arguments_[:]
-    #while len(argtypes) > 0 and argtypes[-1] == '-':
-    #    argtypes.pop()
     at = argtypes[0] if len(argtypes) >= 1 else '-'
     bt = argtypes[1] if len(argtypes) >= 2 else '-'
     ct = argtypes[2] if len(argtypes) >= 3 else '-'
     dt = argtypes[3] if len(argtypes) >= 4 else '-'
     et = argtypes[4] if len(argtypes) >= 5 else '-'
+    ft = argtypes[5] if len(argtypes) >= 6 else '-'
     def function0(self):
         return func(self)
     def function1(self, a):
@@ -601,6 +629,14 @@
         i = unpack_arg(d, dt)
         j = unpack_arg(e, et)
         return func(self, f, g, h, i, j)
+    def function6(self, a, b, c, d, e, f):
+        g = unpack_arg(a, at)
+        h = unpack_arg(b, bt)
+        i = unpack_arg(c, ct)
+        j = unpack_arg(d, dt)
+        k = unpack_arg(e, et)
+        l = unpack_arg(f, ft)
+        return func(self, g, h, i, j, k, l)
     if len(argtypes) == 0:
         function = function0
     elif len(argtypes) == 1:
@@ -622,6 +658,8 @@
             function = function4_last_default
     elif len(argtypes) == 5:
         function = function5
+    elif len(argtypes) == 6:
+        function = function6
     else:
         assert 0, "implement function for argtypes %s" % (argtypes,)
     function.__name__ = mnemonic
diff --git a/rpython/jit/backend/zarch/instructions.py b/rpython/jit/backend/zarch/instructions.py
--- a/rpython/jit/backend/zarch/instructions.py
+++ b/rpython/jit/backend/zarch/instructions.py
@@ -295,7 +295,7 @@
 
 vector_mnemonic_codes = {
     'VL':     ('vrx', ['\xE7','\x06'], 'v,bid'),
-    'VLR':    ('vrr_a', ['\xE7','\x56']),
+    'VLR':    ('vrr_a', ['\xE7','\x56'], 'v,v'),
 
     'VST':    ('vrx', ['\xE7','\x0E'], 'v,bid'),
 
@@ -316,6 +316,16 @@
     'VFM':   ('vrr_c', ['\xE7','\xE7']),
     'VFD':   ('vrr_c', ['\xE7','\xE5']),
 
+    # conversion
+    'VCDG':  ('vrr_a', ['\xE7','\xC3']),
+
+    # compare, sign, ...
+    'VFPSO': ('vrr_a', ['\xE7','\xCC']),
+    'VFCE':  ('vrr_c', ['\xE7','\xE8']),
+    'VSEL':  ('vrr_e', ['\xE7','\x8D'], 'v,v,v,v'),
+    'VPERM': ('vrr_e', ['\xE7','\x8C'], 'v,v,v,v'),
+    'VREPI': ('vri_a', ['\xE7','\x45']),
+
     # '': ('', ['','']),
 }
 
diff --git a/rpython/jit/backend/zarch/locations.py b/rpython/jit/backend/zarch/locations.py
--- a/rpython/jit/backend/zarch/locations.py
+++ b/rpython/jit/backend/zarch/locations.py
@@ -25,6 +25,9 @@
     def is_fp_reg(self):
         return False
 
+    def is_vector_reg(self):
+        return False
+
     def is_imm_float(self):
         return False
 
@@ -118,13 +121,10 @@
     def is_core_reg(self):
         return False
 
-    def is_fp_reg(self):
-        return True
-
     def as_key(self):            # 16 <= as_key <= 32
         return self.value + 32
 
-    def is_float(self):
+    def is_vector_reg(self):
         return True
 
 class ImmLocation(AssemblerLocation):
diff --git a/rpython/jit/backend/zarch/regalloc.py b/rpython/jit/backend/zarch/regalloc.py
--- a/rpython/jit/backend/zarch/regalloc.py
+++ b/rpython/jit/backend/zarch/regalloc.py
@@ -54,6 +54,12 @@
     def __repr__(self):
         return "<TempFloat at %s>" % (id(self),)
 
+class TempVector(TempVar):
+    type = 'V'
+
+    def __repr__(self):
+        return "<TempVector at %s>" % (id(self),)
+
 
 class FPRegisterManager(RegisterManager):
     all_regs              = r.MANAGED_FP_REGS
@@ -142,8 +148,7 @@
         return loc
 
     def get_scratch_reg(self, selected_reg=None):
-        # TODO
-        box = TempFloat()
+        box = TempVector()
         reg = self.force_allocate_reg(box, forbidden_vars=self.temp_boxes, selected_reg=selected_reg)
         self.temp_boxes.append(box)
         return reg
diff --git a/rpython/jit/backend/zarch/vector_ext.py b/rpython/jit/backend/zarch/vector_ext.py
--- a/rpython/jit/backend/zarch/vector_ext.py
+++ b/rpython/jit/backend/zarch/vector_ext.py
@@ -15,6 +15,7 @@
 import rpython.jit.backend.zarch.registers as r
 import rpython.jit.backend.zarch.conditions as c
 import rpython.jit.backend.zarch.locations as l
+import rpython.jit.backend.zarch.masks as m
 from rpython.jit.backend.zarch.locations import imm
 from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
 from rpython.rtyper.lltypesystem import lltype, rffi
@@ -27,42 +28,22 @@
         llop.debug_print(lltype.Void, msg)
     raise NotImplementedError(msg)
 
-def flush_vec_cc(asm, regalloc, condition, size, result_loc):
+def flush_vec_cc(asm, regalloc, condition, size, resultloc):
     # After emitting an instruction that leaves a boolean result in
-    # a condition code (cc), call this.  In the common case, result_loc
+    # a condition code (cc), call this.  In the common case, resultloc
     # will be set to SPP by the regalloc, which in this case means
     # "propagate it between this operation and the next guard by keeping
-    # it in the cc".  In the uncommon case, result_loc is another
+    # it in the cc".  In the uncommon case, resultloc is another
     # register, and we emit a load from the cc into this register.
 
-    # Possibly invert the bit in the CR
-    #bit, invert = c.encoding[condition]
-    #assert 24 <= bit <= 27
-    #if invert == 12:
-    #    pass
-    #elif invert == 4:
-    #    asm.mc.crnor(bit, bit, bit)
-    #else:
-    #    assert 0
-    #assert asm.guard_success_cc == c.cond_none
-    ##
-    #if result_loc is r.SPP:
-    #    asm.guard_success_cc = condition
-    #else:
-    #    resval = result_loc.value
-    #    # either doubleword integer 1 (2x) or word integer 1 (4x)
-    #    ones = regalloc.vrm.get_scratch_reg(type=INT).value
-    #    zeros = regalloc.vrm.get_scratch_reg(type=INT).value
-    #    asm.mc.vxor(zeros, zeros, zeros)
-    #    if size == 4:
-    #        asm.mc.vspltisw(ones, 1)
-    #    else:
-    #        assert size == 8
-    #        tloc = regalloc.rm.get_scratch_reg()
-    #        asm.mc.load_imm(tloc, asm.VEC_DOUBLE_WORD_ONES)
-    #        asm.mc.lvx(ones, 0, tloc.value)
-    #    asm.mc.vsel(resval, zeros, ones, resval)
-    pass
+    if resultloc is r.SPP:
+        asm.guard_success_cc = condition
+    else:
+        ones = regalloc.vrm.get_scratch_reg()
+        zeros = regalloc.vrm.get_scratch_reg()
+        asm.mc.VX(zeros, zeros, zeros)
+        asm.mc.VREPI(ones, l.imm(1), l.itemsize_to_mask(size))
+        asm.mc.VSEL(resultloc, ones, zeros, resultloc)
 
 class ZSIMDVectorExt(VectorExt):
     def setup_once(self, asm):
@@ -119,7 +100,7 @@
         resloc, loc0, loc1, itemsize_loc = arglocs
         itemsize = itemsize_loc.value
         if itemsize == 8:
-            self.mc.VFA(resloc, loc0, loc1, 3, 0)
+            self.mc.VFA(resloc, loc0, loc1, 3, 0, 0)
             return
         not_implemented("vec_float_add of size %d" % itemsize)
 
@@ -127,7 +108,7 @@
         resloc, loc0, loc1, itemsize_loc = arglocs
         itemsize = itemsize_loc.value
         if itemsize == 8:
-            self.mc.VFS(resloc, loc0, loc1, 3, 0)
+            self.mc.VFS(resloc, loc0, loc1, 3, 0, 0)
             return
         not_implemented("vec_float_sub of size %d" % itemsize)
 
@@ -135,7 +116,7 @@
         resloc, loc0, loc1, itemsize_loc = arglocs
         itemsize = itemsize_loc.value
         if itemsize == 8:
-            self.mc.VFM(resloc, loc0, loc1, 3, 0)
+            self.mc.VFM(resloc, loc0, loc1, 3, 0, 0)
             return
         not_implemented("vec_float_mul of size %d" % itemsize)
 
@@ -143,7 +124,7 @@
         resloc, loc0, loc1, itemsize_loc = arglocs
         itemsize = itemsize_loc.value
         if itemsize == 8:
-            self.mc.VFD(resloc, loc0, loc1, 3, 0)
+            self.mc.VFD(resloc, loc0, loc1, 3, 0, 0)
             return
         not_implemented("vec_float_truediv of size %d" % itemsize)
 
@@ -168,22 +149,18 @@
     def emit_vec_float_abs(self, op, arglocs, regalloc):
         resloc, argloc, sizeloc = arglocs
         size = sizeloc.value
-        if size == 4:
-            self.mc.xvabssp(resloc.value, argloc.value)
-        elif size == 8:
-            self.mc.xvabsdp(resloc.value, argloc.value)
-        else:
-            not_implemented("float abs for size %d" % size)
+        if size == 8:
+            self.mc.VFPSO(resloc, argloc, 3, 0, 2)
+            return
+        not_implemented("vec_float_abs of size %d" % itemsize)
 
     def emit_vec_float_neg(self, op, arglocs, regalloc):
         resloc, argloc, sizeloc = arglocs
         size = sizeloc.value
-        if size == 4:
-            self.mc.xvnegsp(resloc.value, argloc.value)
-        elif size == 8:
-            self.mc.xvnegdp(resloc.value, argloc.value)
-        else:
-            not_implemented("float neg for size %d" % size)
+        if size == 8:
+            self.mc.VFPSO(resloc, argloc, 3, 0, 0)
+            return
+        not_implemented("vec_float_abs of size %d" % itemsize)
 
     def emit_vec_guard_true(self, guard_op, arglocs, regalloc):
         self._emit_guard(guard_op, arglocs)
@@ -212,42 +189,9 @@
             self._accum_reduce(op, scalar_arg, vector_loc, scalar_loc)
             accum_info = accum_info.next()
 
-    def _accum_reduce(self, op, arg, accumloc, targetloc):
-        # Currently the accumulator can ONLY be the biggest
-        # 64 bit float/int
-        tgt = targetloc.value
-        acc = accumloc.value
-        if arg.type == FLOAT:
-            # r = (r[0]+r[1],r[0]+r[1])
-            if IS_BIG_ENDIAN:
-                self.mc.xxpermdi(tgt, acc, acc, 0b00)
-            else:
-                self.mc.xxpermdi(tgt, acc, acc, 0b10)
-            if op == '+':
-                self.mc.xsadddp(tgt, tgt, acc)
-            elif op == '*':
-                self.mc.xsmuldp(tgt, tgt, acc)
-            else:
-                not_implemented("sum not implemented")
-            return
-        else:
-            assert arg.type == INT
-            self.mc.load_imm(r.SCRATCH2, PARAM_SAVE_AREA_OFFSET)
-            self.mc.stvx(acc, r.SCRATCH2.value, r.SP.value)
-            self.mc.load(tgt, r.SP.value, PARAM_SAVE_AREA_OFFSET)
-            self.mc.load(r.SCRATCH.value, r.SP.value, PARAM_SAVE_AREA_OFFSET+8)
-            if op == '+':
-                self.mc.add(tgt, tgt, acc)
-            elif op == '*':
-                self.mc.mulld(tgt, tgt, acc)
-            else:
-                not_implemented("sum not implemented")
-            return
-
-        not_implemented("reduce sum for %s not impl." % arg)
-
     def emit_vec_int_is_true(self, op, arglocs, regalloc):
         assert isinstance(op, VectorOp)
+        # TODO
         resloc, argloc, sizeloc = arglocs
         size = sizeloc.value
         tmp = regalloc.vrm.get_scratch_reg(type=INT).value
@@ -266,22 +210,13 @@
 
     def emit_vec_float_eq(self, op, arglocs, regalloc):
         assert isinstance(op, VectorOp)
-        resloc, loc1, loc2, sizeloc = arglocs
+        resloc, loc0, loc1, sizeloc = arglocs
         size = sizeloc.value
-        tmp = regalloc.vrm.get_scratch_reg().value
-        offloc = regalloc.rm.get_scratch_reg()
-        off = offloc.value
-        # SP is always 16 byte aligned, and PARAM_SAVE_AREA_OFFSET % 16 == 0
-        self.mc.load_imm(offloc, PARAM_SAVE_AREA_OFFSET)
-        if size == 4:
-            self.mc.xvcmpeqspx(tmp, loc1.value, loc2.value)
-            self.mc.stxvw4x(tmp, off, r.SP.value)
-        elif size == 8:
-            self.mc.xvcmpeqdpx(tmp, loc1.value, loc2.value)
-            self.mc.stxvd2x(tmp, off, r.SP.value)
+        if size == 8:
+            # bit 3 in last argument sets the condition code
+            self.mc.VFCE(resloc, loc0, loc1, 3, 0, 1)
         else:
             not_implemented("[zarch/assembler] float == for size %d" % size)
-        self.mc.lvx(resloc.value, off, r.SP.value)
         flush_vec_cc(self, regalloc, c.VEQI, op.bytesize, resloc)
 
     def emit_vec_float_xor(self, op, arglocs, regalloc):
@@ -314,14 +249,16 @@
         flush_vec_cc(self, regalloc, c.VNEI, op.bytesize, resloc)
 
     def emit_vec_cast_int_to_float(self, op, arglocs, regalloc):
-        res, l0 = arglocs
+        resloc, loc0 = arglocs
         offloc = regalloc.rm.get_scratch_reg()
         off = offloc.value
         # SP is always 16 byte aligned, and PARAM_SAVE_AREA_OFFSET % 16 == 0
-        self.mc.load_imm(offloc, PARAM_SAVE_AREA_OFFSET)
-        self.mc.stvx(l0.value, off, r.SP.value)
-        self.mc.lxvd2x(res.value, off, r.SP.value)
-        self.mc.xvcvsxddp(res.value, res.value)
+        # bit 1 on mask4 -> suppresses inexact exception
+        self.mc.VCDG(resloc, loc0, 3, 4, m.RND_TOZERO.value)
+        #self.mc.load_imm(offloc, PARAM_SAVE_AREA_OFFSET)
+        #self.mc.stvx(l0.value, off, r.SP.value)
+        #self.mc.lxvd2x(res.value, off, r.SP.value)
+        #self.mc.xvcvsxddp(res.value, res.value)
 
     def emit_vec_int_eq(self, op, arglocs, regalloc):
         assert isinstance(op, VectorOp)
@@ -354,6 +291,15 @@
         self.mc.vnor(res.value, res.value, res.value)
         flush_vec_cc(self, regalloc, c.VEQI, op.bytesize, res)
 
+    def emit_vec_cast_float_to_int(self, op, arglocs, regalloc):
+        res, l0 = arglocs
+        offloc = regalloc.rm.get_scratch_reg()
+        v0 = regalloc.vrm.get_scratch_reg(type=INT)
+        off = offloc.value
+        # SP is always 16 byte aligned, and PARAM_SAVE_AREA_OFFSET % 16 == 0
+        self.mc.load_imm(offloc, PARAM_SAVE_AREA_OFFSET)
+        self.mc.xvcvdpsxds(res.value, l0.value)
+
     def emit_vec_expand_f(self, op, arglocs, regalloc):
         assert isinstance(op, VectorOp)
         resloc, srcloc = arglocs
@@ -540,14 +486,40 @@
             return
         not_implemented("unpack for combination src %d -> res %d" % (srcidx, residx))
 
-    def emit_vec_cast_float_to_int(self, op, arglocs, regalloc):
-        res, l0 = arglocs
-        offloc = regalloc.rm.get_scratch_reg()
-        v0 = regalloc.vrm.get_scratch_reg(type=INT)
-        off = offloc.value
-        # SP is always 16 byte aligned, and PARAM_SAVE_AREA_OFFSET % 16 == 0
-        self.mc.load_imm(offloc, PARAM_SAVE_AREA_OFFSET)
-        self.mc.xvcvdpsxds(res.value, l0.value)
+    def _accum_reduce(self, op, arg, accumloc, targetloc):
+        # Currently the accumulator can ONLY be the biggest
+        # 64 bit float/int
+        # TODO
+        tgt = targetloc.value
+        acc = accumloc.value
+        if arg.type == FLOAT:
+            # r = (r[0]+r[1],r[0]+r[1])
+            if IS_BIG_ENDIAN:
+                self.mc.xxpermdi(tgt, acc, acc, 0b00)
+            else:
+                self.mc.xxpermdi(tgt, acc, acc, 0b10)
+            if op == '+':
+                self.mc.xsadddp(tgt, tgt, acc)
+            elif op == '*':
+                self.mc.xsmuldp(tgt, tgt, acc)
+            else:
+                not_implemented("sum not implemented")
+            return
+        else:
+            assert arg.type == INT
+            self.mc.load_imm(r.SCRATCH2, PARAM_SAVE_AREA_OFFSET)
+            self.mc.stvx(acc, r.SCRATCH2.value, r.SP.value)
+            self.mc.load(tgt, r.SP.value, PARAM_SAVE_AREA_OFFSET)
+            self.mc.load(r.SCRATCH.value, r.SP.value, PARAM_SAVE_AREA_OFFSET+8)
+            if op == '+':
+                self.mc.add(tgt, tgt, acc)
+            elif op == '*':
+                self.mc.mulld(tgt, tgt, acc)
+            else:
+                not_implemented("sum not implemented")
+            return
+
+        not_implemented("reduce sum for %s not impl." % arg)
 
     def emit_vec_f(self, op, arglocs, regalloc):
         pass
diff --git a/rpython/jit/metainterp/test/test_vector.py b/rpython/jit/metainterp/test/test_vector.py
--- a/rpython/jit/metainterp/test/test_vector.py
+++ b/rpython/jit/metainterp/test/test_vector.py
@@ -133,9 +133,9 @@
 
     vec_float_unary = functools.partial(vec_int_unary, _vector_float_unary)
 
-    test_vec_abs_float = \
+    test_vec_float_abs = \
             vec_float_unary(lambda v: abs(v), rffi.DOUBLE)
-    test_vec_neg_float = \
+    test_vec_float_neg = \
             vec_float_unary(lambda v: -v, rffi.DOUBLE)
 
     # FLOAT BINARY
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy zarch-simd-support: more vector ops, int->float, comparison float

Reply via email to