Author: Richard Plangger <planri...@gmail.com>
Branch: vecopt-merge
Changeset: r79585:efc1fdaa1cee
Date: 2015-09-10 18:30 +0200
http://bitbucket.org/pypy/pypy/changeset/efc1fdaa1cee/

Log:    adapted the optimization entry point to fit into the new model. making
        progress, but not yet complete enough to run the first simple loops

diff too long, truncating to 2000 out of 6555 lines

diff --git a/rpython/jit/backend/x86/test/test_vectorize.py b/rpython/jit/backend/x86/test/test_vectorize.py
deleted file mode 100644
--- a/rpython/jit/backend/x86/test/test_vectorize.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import py
-from rpython.jit.backend.x86.regloc import *
-from rpython.jit.backend.x86.test import test_basic
-from rpython.jit.backend.x86.test.test_assembler import \
-        (TestRegallocPushPop as BaseTestAssembler)
-from rpython.jit.backend.detect_cpu import getcpuclass
-from rpython.jit.metainterp.history import ConstFloat
-from rpython.jit.metainterp.test import support, test_vectorize
-from rpython.jit.metainterp.warmspot import ll_meta_interp
-from rpython.rlib.jit import JitDriver
-from rpython.rtyper.lltypesystem import lltype
-
-
-class TestBasic(test_vectorize.VectorizeLLtypeTests, test_basic.Jit386Mixin):
-    # for the individual tests see
-    # ====> ../../../metainterp/test/test_basic.py
-    enable_opts = 'intbounds:rewrite:virtualize:string:earlyforce:pure:heap:unroll'
-
-class TestAssembler(BaseTestAssembler):
-    def imm_4_int32(self, a, b, c, d):
-        adr = self.xrm.assembler.datablockwrapper.malloc_aligned(16, 16)
-        ptr = rffi.cast(rffi.CArrayPtr(rffi.INT), adr)
-        ptr[0] = rffi.r_int(a)
-        ptr[1] = rffi.r_int(b)
-        ptr[2] = rffi.r_int(c)
-        ptr[3] = rffi.r_int(d)
-        return adr
-
-    def test_simple_4_int_load_sum_x86_64(self):
-        def callback(asm):
-            if asm.mc.WORD != 8:
-                py.test.skip()
-            adr = self.imm_4_int32(123,543,0,0)
-            asm.mc.MOV_ri(r8.value,adr)
-            asm.mc.MOVDQU_xm(xmm7.value, (r8.value, 0))
-            asm.mc.PADDD_xm(xmm7.value, (r8.value, 0))
-            asm.mc.PADDD_xx(xmm7.value, xmm7.value)
-
-            asm.mc.MOV_ri(edx.value, 0x00000000ffffffff)
-
-            asm.mc.MOV_ri(eax.value, 0)
-            asm.mc.MOVDQ_rx(ecx.value, xmm7.value)
-            asm.mc.AND_rr(ecx.value, edx.value)
-            asm.mc.ADD(eax, ecx)
-
-            asm.mc.PSRLDQ_xi(xmm7.value, 4)
-            asm.mc.MOVDQ_rx(ecx.value, xmm7.value)
-            asm.mc.AND_rr(ecx.value, edx.value)
-            asm.mc.ADD(eax, ecx)
-        res = self.do_test(callback)
-        assert res == 123*4 + 543*4
-
-    def test_vector_store(self):
-        def callback(asm):
-            addr = self.imm_4_int32(11,12,13,14)
-            asm.mov(ImmedLoc(addr), ecx)
-            asm.mc.MOVDQU_xm(xmm6.value, (ecx.value,0))
-            asm.mc.PADDD_xm(xmm6.value, (ecx.value,0))
-            asm.mc.MOVDQU(AddressLoc(ecx,ImmedLoc(0)), xmm6)
-            asm.mc.MOVDQU(xmm6, AddressLoc(ecx,ImmedLoc(0)))
-            asm.mc.MOVDQ_rx(eax.value, xmm6.value)
-
-        res = self.do_test(callback) & 0xffffffff
-        assert res == 22
-
-
-    def test_vector_store_aligned(self):
-        def callback(asm):
-            addr = self.imm_4_int32(11,12,13,14)
-            asm.mov(ImmedLoc(addr), ecx)
-            asm.mc.MOVDQA(xmm6, AddressLoc(ecx,ImmedLoc(0)))
-            asm.mc.PADDD_xm(xmm6.value, (ecx.value,0))
-            asm.mc.MOVDQA(AddressLoc(ecx,ImmedLoc(0)), xmm6)
-            asm.mc.MOVDQA(xmm6, AddressLoc(ecx,ImmedLoc(0)))
-            asm.mc.MOVDQ_rx(eax.value, xmm6.value)
-
-        res = self.do_test(callback) & 0xffffffff
-        assert res == 22
diff --git a/rpython/jit/backend/x86/test/test_x86vector.py b/rpython/jit/backend/x86/test/test_x86vector.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/x86/test/test_x86vector.py
@@ -0,0 +1,78 @@
+import py
+from rpython.jit.backend.x86.regloc import *
+from rpython.jit.backend.x86.test import test_basic
+from rpython.jit.backend.x86.test.test_assembler import \
+        (TestRegallocPushPop as BaseTestAssembler)
+from rpython.jit.backend.detect_cpu import getcpuclass
+from rpython.jit.metainterp.history import ConstFloat
+from rpython.jit.metainterp.test import support, test_metavec
+from rpython.jit.metainterp.warmspot import ll_meta_interp
+from rpython.rlib.jit import JitDriver
+from rpython.rtyper.lltypesystem import lltype
+
+
+class TestBasic(test_metavec.VectorizeLLtypeTests, test_basic.Jit386Mixin):
+    # for the individual tests see
+    # ====> ../../../metainterp/test/test_basic.py
+    enable_opts = 'intbounds:rewrite:virtualize:string:earlyforce:pure:heap:unroll'
+
+class TestAssembler(BaseTestAssembler):
+    def imm_4_int32(self, a, b, c, d):
+        adr = self.xrm.assembler.datablockwrapper.malloc_aligned(16, 16)
+        ptr = rffi.cast(rffi.CArrayPtr(rffi.INT), adr)
+        ptr[0] = rffi.r_int(a)
+        ptr[1] = rffi.r_int(b)
+        ptr[2] = rffi.r_int(c)
+        ptr[3] = rffi.r_int(d)
+        return adr
+
+    def test_simple_4_int_load_sum_x86_64(self):
+        def callback(asm):
+            if asm.mc.WORD != 8:
+                py.test.skip()
+            adr = self.imm_4_int32(123,543,0,0)
+            asm.mc.MOV_ri(r8.value,adr)
+            asm.mc.MOVDQU_xm(xmm7.value, (r8.value, 0))
+            asm.mc.PADDD_xm(xmm7.value, (r8.value, 0))
+            asm.mc.PADDD_xx(xmm7.value, xmm7.value)
+
+            asm.mc.MOV_ri(edx.value, 0x00000000ffffffff)
+
+            asm.mc.MOV_ri(eax.value, 0)
+            asm.mc.MOVDQ_rx(ecx.value, xmm7.value)
+            asm.mc.AND_rr(ecx.value, edx.value)
+            asm.mc.ADD(eax, ecx)
+
+            asm.mc.PSRLDQ_xi(xmm7.value, 4)
+            asm.mc.MOVDQ_rx(ecx.value, xmm7.value)
+            asm.mc.AND_rr(ecx.value, edx.value)
+            asm.mc.ADD(eax, ecx)
+        res = self.do_test(callback)
+        assert res == 123*4 + 543*4
+
+    def test_vector_store(self):
+        def callback(asm):
+            addr = self.imm_4_int32(11,12,13,14)
+            asm.mov(ImmedLoc(addr), ecx)
+            asm.mc.MOVDQU_xm(xmm6.value, (ecx.value,0))
+            asm.mc.PADDD_xm(xmm6.value, (ecx.value,0))
+            asm.mc.MOVDQU(AddressLoc(ecx,ImmedLoc(0)), xmm6)
+            asm.mc.MOVDQU(xmm6, AddressLoc(ecx,ImmedLoc(0)))
+            asm.mc.MOVDQ_rx(eax.value, xmm6.value)
+
+        res = self.do_test(callback) & 0xffffffff
+        assert res == 22
+
+
+    def test_vector_store_aligned(self):
+        def callback(asm):
+            addr = self.imm_4_int32(11,12,13,14)
+            asm.mov(ImmedLoc(addr), ecx)
+            asm.mc.MOVDQA(xmm6, AddressLoc(ecx,ImmedLoc(0)))
+            asm.mc.PADDD_xm(xmm6.value, (ecx.value,0))
+            asm.mc.MOVDQA(AddressLoc(ecx,ImmedLoc(0)), xmm6)
+            asm.mc.MOVDQA(xmm6, AddressLoc(ecx,ImmedLoc(0)))
+            asm.mc.MOVDQ_rx(eax.value, xmm6.value)
+
+        res = self.do_test(callback) & 0xffffffff
+        assert res == 22
diff --git a/rpython/jit/metainterp/compile.py b/rpython/jit/metainterp/compile.py
--- a/rpython/jit/metainterp/compile.py
+++ b/rpython/jit/metainterp/compile.py
@@ -255,6 +255,12 @@
     history = metainterp.history
     warmstate = jitdriver_sd.warmstate
 
+    enable_opts = jitdriver_sd.warmstate.enable_opts
+    if try_disabling_unroll:
+        if 'unroll' not in enable_opts:
+            return None
+        enable_opts = enable_opts.copy()
+        del enable_opts['unroll']
 
     ops = history.operations[start:]
     if 'unroll' not in enable_opts:
@@ -292,6 +298,12 @@
                                              metainterp.box_names_memo)
     except InvalidLoop:
         return None
+
+    if ((warmstate.vec and jitdriver_sd.vec) or warmstate.vec_all):
+        from rpython.jit.metainterp.optimizeopt.vector import optimize_vector
+        loop_info, loop_ops = optimize_vector(metainterp_sd,
+                                              jitdriver_sd, warmstate,
+                                              loop_info, loop_ops)
     #
     loop = create_empty_loop(metainterp)
     loop.original_jitcell_token = jitcell_token
@@ -312,16 +324,15 @@
             label_token.short_preamble, metainterp.box_names_memo)
     loop.operations = ([start_label] + preamble_ops + loop_info.extra_same_as +
                        [loop_info.label_op] + loop_ops)
+    if loop.versions is not None:
+        # every different loop version must update their target tokens
+        for version in loop.versions:
+            version.update_token(jitcell_token, all_target_tokens)
     if not we_are_translated():
         loop.check_consistency()
     send_loop_to_backend(greenkey, jitdriver_sd, metainterp_sd, loop, "loop",
                          inputargs, metainterp.box_names_memo)
 
-    # XXX if loop.versions is not None:
-    #    # every different loop version must update their target tokens
-    #    for version in loop.versions:
-    #        version.update_token(jitcell_token, all_target_tokens)
-
     record_loop_or_bridge(metainterp_sd, loop)
     generate_pending_loop_versions(loop, jitdriver_sd, metainterp, jitcell_token)
     return start_descr
diff --git a/rpython/jit/metainterp/history.py b/rpython/jit/metainterp/history.py
--- a/rpython/jit/metainterp/history.py
+++ b/rpython/jit/metainterp/history.py
@@ -481,111 +481,6 @@
     def repr_of_descr(self):
         return 'TargetToken(%d)' % compute_unique_id(self)
 
-def index_of_first(opnum, operations, pass_by=0):
-    """ returns the position of the first operation matching the opnum.
-    Or -1 if non is found
-    """
-    for i,op in enumerate(operations):
-        if op.getopnum() == opnum:
-            if pass_by == 0:
-                return i
-            else:
-                pass_by -= 1
-    return -1
-
-class VersionInfo(object):
-    def __init__(self):
-        self.descrs = []
-        self.leads_to = {}
-        self.insert_index = -1
-
-    def mark(self):
-        self.insert_index = len(self.descrs)
-
-    def clear(self):
-        self.insert_index = -1
-
-    def track(self, op, descr, version):
-        assert descr.loop_version()
-        i = self.insert_index
-        if i >= 0:
-            assert i >= 0
-            self.descrs.insert(i, descr)
-        else:
-            self.descrs.append(descr)
-        self.leads_to[descr] = version
-        # note: stitching a guard must resemble the order of the label
-        # otherwise a wrong mapping is handed to the register allocator
-        op.setfailargs(version.renamed_inputargs)
-        assert version.renamed_inputargs is not None
-
-    def remove(self, descr):
-        if descr in self.leads_to:
-            del self.leads_to[descr]
-        else:
-            assert 0, "could not remove %s" % descr
-
-    def get(self, descr):
-        return self.leads_to.get(descr, None)
-
-class LoopVersion(object):
-    """ A special version of a trace loop. Use loop.snaphost() to
-        create one instance and attach it to a guard descr.
-        If not attached to a descriptor, it will not be compiled.
-    """
-    inputargs = None
-    renamed_inputargs = None
-
-    def __init__(self, operations):
-        self.operations = operations
-        idx = index_of_first(rop.LABEL, self.operations)
-        assert idx >= 0
-        label = self.operations[idx]
-        self.inputargs = label.getarglist()
-        self.renamed_inputargs = label.getarglist()
-
-    def setup_once(self, info):
-        for op in self.operations:
-            if op.is_guard():
-                olddescr = op.getdescr()
-                if not olddescr:
-                    continue
-                descr = olddescr.clone()
-                op.setdescr(descr)
-                if descr.loop_version():
-                    toversion = info.leads_to.get(olddescr,None)
-                    if toversion:
-                        info.track(op, descr, toversion)
-                    else:
-                        assert 0, "olddescr must be found"
-
-    def update_token(self, jitcell_token, all_target_tokens):
-        # this is only invoked for versioned loops!
-        label_index = index_of_first(rop.LABEL, self.operations, 0)
-        label = self.operations[label_index]
-        jump = self.operations[-1]
-        #
-        assert jump.getopnum() == rop.JUMP
-        #
-        token = TargetToken(jitcell_token)
-        token.original_jitcell_token = jitcell_token
-        all_target_tokens.append(token)
-        if label.getdescr() is None or label.getdescr() is not jump.getdescr():
-            label_index = index_of_first(rop.LABEL, self.operations, 1)
-            if label_index > 0:
-                second_label = self.operations[label_index]
-                # set the inner loop
-                second_label.setdescr(token)
-                jump.setdescr(token)
-                # set the first label
-                token = TargetToken(jitcell_token)
-                token.original_jitcell_token = jitcell_token
-                all_target_tokens.append(token)
-                label.setdescr(token)
-                return
-        label.setdescr(token)
-        jump.setdescr(token)
-
 class TreeLoop(object):
     inputargs = None
     operations = None
@@ -607,7 +502,6 @@
     def __init__(self, name):
         self.name = name
         self.versions = []
-        self.version_info = VersionInfo()
         # self.operations = list of ResOperations
         #   ops of the kind 'guard_xxx' contain a further list of operations,
         #   which may itself contain 'guard_xxx' and so on, making a tree.
@@ -626,30 +520,30 @@
             insns[opname] = insns.get(opname, 0) + 1
         return insns
 
-    def append_loop(self, loop, all_target_tokens):
-        # append e.g. the peeled loop to this loop!
-        jump = loop.operations[-1]
-        assert jump.getdescr() is not None
-        target_token = None
-        i = 0
-        # adds all target token until the one is found that jumps from the 
-        # last instruction to the label
-        while i < len(loop.operations) and target_token is not jump.getdescr():
-            # there is another label
-            op = loop.operations[i]
-            if op.getopnum() == rop.LABEL:
-                target_token = op.getdescr()
-                assert isinstance(target_token, TargetToken)
-                all_target_tokens.append(target_token)
-            i += 1
-        #
-        self.operations = self.operations[:-1] + loop.operations
-        self.versions = loop.versions
-        loop.versions = None
-        self.version_info = loop.version_info
-        loop.version_info = None
-        if loop.quasi_immutable_deps:
-            self.quasi_immutable_deps.update(loop.quasi_immutable_deps)
+    # XXX VECdef append_loop(self, loop, all_target_tokens):
+    # XXX VEC    # append e.g. the peeled loop to this loop!
+    # XXX VEC    jump = loop.operations[-1]
+    # XXX VEC    assert jump.getdescr() is not None
+    # XXX VEC    target_token = None
+    # XXX VEC    i = 0
+    # XXX VEC    # adds all target token until the one is found that jumps from the 
+    # XXX VEC    # last instruction to the label
+    # XXX VEC    while i < len(loop.operations) and target_token is not jump.getdescr():
+    # XXX VEC        # there is another label
+    # XXX VEC        op = loop.operations[i]
+    # XXX VEC        if op.getopnum() == rop.LABEL:
+    # XXX VEC            target_token = op.getdescr()
+    # XXX VEC            assert isinstance(target_token, TargetToken)
+    # XXX VEC            all_target_tokens.append(target_token)
+    # XXX VEC        i += 1
+    # XXX VEC    #
+    # XXX VEC    self.operations = self.operations[:-1] + loop.operations
+    # XXX VEC    self.versions = loop.versions
+    # XXX VEC    loop.versions = None
+    # XXX VEC    self.version_info = loop.version_info
+    # XXX VEC    loop.version_info = None
+    # XXX VEC    if loop.quasi_immutable_deps:
+    # XXX VEC        self.quasi_immutable_deps.update(loop.quasi_immutable_deps)
 
     def get_operations(self):
         return self.operations
@@ -659,34 +553,6 @@
             self.name,
             ', '.join([box.repr(memo) for box in self.inputargs]))
 
-    def find_first_index(self, opnum, pass_by=0):
-        """ return the first index of the operation having the same opnum or -1 """
-        return index_of_first(opnum, self.operations, pass_by)
-
-    def find_first(self, opnum, pass_by=0):
-        index = self.find_first_index(opnum, pass_by)
-        if index != -1:
-            return self.operations[index]
-        return None
-
-    def snapshot(self):
-        oplist = self.copy_operations(self.operations)
-        version = LoopVersion(oplist)
-        version.setup_once(self.version_info)
-        # register the faildescr for later stitching
-        self.versions.append(version)
-        return version
-
-    def copy_operations(self, operations):
-        ignore = (rop.DEBUG_MERGE_POINT,)
-        oplist = []
-        for op in operations:
-            if op.getopnum() in ignore:
-                continue
-            cloned = op.clone()
-            oplist.append(cloned)
-        return oplist
-
     def get_display_text(self):    # for graphpage.py
         return self.name + '\n' + repr(self.inputargs)
 
diff --git a/rpython/jit/metainterp/optimizeopt/__init__.py b/rpython/jit/metainterp/optimizeopt/__init__.py
--- a/rpython/jit/metainterp/optimizeopt/__init__.py
+++ b/rpython/jit/metainterp/optimizeopt/__init__.py
@@ -7,12 +7,10 @@
 from rpython.jit.metainterp.optimizeopt.simplify import OptSimplify
 from rpython.jit.metainterp.optimizeopt.pure import OptPure
 from rpython.jit.metainterp.optimizeopt.earlyforce import OptEarlyForce
-from rpython.jit.metainterp.optimizeopt.vectorize import optimize_vector
 from rpython.rlib.jit import PARAMETERS, ENABLE_ALL_OPTS
 from rpython.rlib.unroll import unrolling_iterable
 from rpython.rlib.debug import debug_start, debug_stop, debug_print
 
-
 ALL_OPTS = [('intbounds', OptIntBounds),
             ('rewrite', OptRewrite),
             ('virtualize', OptVirtualize),
@@ -54,16 +52,9 @@
     """
     debug_start("jit-optimize")
     inputargs = compile_data.start_label.getarglist()
-    enable_opts = warmstate.enable_opts
-    if try_disabling_unroll:
-        if 'unroll' not in enable_opts:
-            return None
-        enable_opts = enable_opts.copy()
-        del enable_opts['unroll']
 
     try:
-        metainterp_sd.logger_noopt.log_loop(inputargs,
-                                            compile_data.operations,
+        metainterp_sd.logger_noopt.log_loop(inputargs, compile_data.operations,
                                             memo=memo)
         if memo is None:
             memo = {}
@@ -72,21 +63,6 @@
                                                 compile_data.enable_opts)
         return compile_data.optimize(metainterp_sd, jitdriver_sd,
                                      optimizations, unroll)
-        # XXX if unroll:
-        # XXX     if not export_state and \
-        # XXX         ((warmstate.vec and jitdriver_sd.vec) \
-        # XXX          or warmstate.vec_all):
-        # XXX         optimize_vector(metainterp_sd, jitdriver_sd, loop,
-        # XXX                         optimizations, inline_short_preamble,
-        # XXX                         start_state, warmstate)
-        # XXX     else:
-        # XXX         return optimize_unroll(metainterp_sd, jitdriver_sd, loop,
-        # XXX                                optimizations, inline_short_preamble,
-        # XXX                                start_state, export_state)
-        # XXX else:
-        # XXX     optimizer = Optimizer(metainterp_sd, jitdriver_sd, loop,
-        # XXX                           optimizations)
-        # XXX     optimizer.propagate_all_forward()
     finally:
         compile_data.forget_optimization_info()
         debug_stop("jit-optimize")
diff --git a/rpython/jit/metainterp/optimizeopt/dependency.py b/rpython/jit/metainterp/optimizeopt/dependency.py
--- a/rpython/jit/metainterp/optimizeopt/dependency.py
+++ b/rpython/jit/metainterp/optimizeopt/dependency.py
@@ -444,7 +444,14 @@
         for _def in self.defs[arg]:
             yield _def[0]
 
+    def is_defined(self, arg):
+        return arg in self.defs
+
     def definition(self, arg, node=None, argcell=None):
+        if arg.is_constant():
+            return None
+        if arg.is_inputarg():
+            return None
         def_chain = self.defs[arg]
         if len(def_chain) == 1:
             return def_chain[0][0]
@@ -473,6 +480,8 @@
     def depends_on_arg(self, arg, to, argcell=None):
         try:
             at = self.definition(arg, to, argcell)
+            if at is None:
+                return
             at.edge_to(to, arg)
         except KeyError:
             if not we_are_translated():
@@ -501,7 +510,9 @@
     """
     def __init__(self, loop):
         self.loop = loop
-        self.nodes = [ Node(op,i) for i,op in enumerate(loop.operations) ]
+        self.label = Node(loop.label, 0)
+        self.nodes = [ Node(op,i+1) for i,op in enumerate(loop.operations) ]
+        self.jump = Node(loop.jump, len(self.nodes)+1)
         self.invariant_vars = {}
         self.update_invariant_vars()
         self.memory_refs = {}
@@ -515,8 +526,8 @@
         return self.nodes[i]
 
     def update_invariant_vars(self):
-        label_op = self.nodes[0].getoperation()
-        jump_op = self.nodes[-1].getoperation()
+        label_op = self.label.getoperation()
+        jump_op = self.jump.getoperation()
         assert label_op.numargs() == jump_op.numargs()
         for i in range(label_op.numargs()):
             label_box = label_op.getarg(i)
@@ -668,6 +679,8 @@
 
     def guard_exit_dependence(self, guard_node, var, tracker):
         def_node = tracker.definition(var)
+        if def_node is None:
+            return
         for dep in def_node.provides():
             if guard_node.is_before(dep.to) and dep.because_of(var):
                 guard_node.edge_to(dep.to, var, label='guard_exit('+str(var)+')')
@@ -690,7 +703,7 @@
         # handle fail args
         if guard_op.getfailargs():
             for arg in guard_op.getfailargs():
-                if arg is None:
+                if arg is None or not tracker.is_defined(arg):
                     continue
                 try:
                     for at in tracker.redefinitions(arg):
@@ -728,10 +741,11 @@
                             # A trace is not entirely in SSA form. complex object
                             # modification introduces WAR/WAW dependencies
                             def_node = tracker.definition(arg)
-                            for dep in def_node.provides():
-                                if dep.to != node:
-                                    dep.to.edge_to(node, argcell, label='war')
-                            def_node.edge_to(node, argcell)
+                            if def_node:
+                                for dep in def_node.provides():
+                                    if dep.to != node:
+                                        dep.to.edge_to(node, argcell, label='war')
+                                def_node.edge_to(node, argcell)
                         except KeyError:
                             pass
                     else:
diff --git a/rpython/jit/metainterp/optimizeopt/guard.py b/rpython/jit/metainterp/optimizeopt/guard.py
--- a/rpython/jit/metainterp/optimizeopt/guard.py
+++ b/rpython/jit/metainterp/optimizeopt/guard.py
@@ -261,14 +261,14 @@
         #
         loop.operations = self._newoperations[:]
 
-    def propagate_all_forward(self, loop, user_code=False):
+    def propagate_all_forward(self, info, loop, user_code=False):
         """ strengthens the guards that protect an integral value """
         # the guards are ordered. guards[i] is before guards[j] iff i < j
         self.collect_guard_information(loop)
         self.eliminate_guards(loop)
         #
-        assert len(loop.versions) == 1
-        version = loop.versions[0]
+        assert len(info.versions) == 1
+        version = info.versions[0]
 
         for i,op in enumerate(loop.operations):
             if not op.is_guard():
@@ -276,10 +276,10 @@
             descr = op.getdescr()
             if descr and descr.loop_version():
                 assert isinstance(descr, ResumeGuardDescr)
-                loop.version_info.track(op, descr, version)
+                info.track(op, descr, version)
 
         if user_code:
-            self.eliminate_array_bound_checks(loop)
+            self.eliminate_array_bound_checks(info, loop)
 
     def emit_operation(self, op):
         self.renamer.rename(op)
@@ -288,8 +288,7 @@
     def operation_position(self):
         return len(self._newoperations)
 
-    def eliminate_array_bound_checks(self, loop):
-        info = loop.version_info
+    def eliminate_array_bound_checks(self, info, loop):
         info.mark()
         version = None
         self._newoperations = []
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_dependency.py b/rpython/jit/metainterp/optimizeopt/test/test_dependency.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_dependency.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_dependency.py
@@ -6,6 +6,7 @@
 from rpython.jit.metainterp.history import TargetToken, JitCellToken, TreeLoop
 from rpython.jit.metainterp.optimizeopt.dependency import (DependencyGraph, Dependency,
         IndexVar, MemoryRef, Node)
+from rpython.jit.metainterp.optimizeopt.vector import TraceLoop
 from rpython.jit.metainterp.resoperation import rop, ResOperation
 from rpython.jit.backend.llgraph.runner import ArrayDescr
 from rpython.rtyper.lltypesystem import rffi
@@ -28,10 +29,9 @@
     def parse_loop(self, ops):
         loop = self.parse(ops, postprocess=self.postprocess)
         token = JitCellToken()
-        loop.operations = [ResOperation(rop.LABEL, loop.inputargs,
-                                   descr=TargetToken(token))] + loop.operations
-        if loop.operations[-1].getopnum() == rop.JUMP:
-            loop.operations[-1].setdescr(token)
+        label = ResOperation(rop.LABEL, loop.inputargs, descr=TargetToken(token))
+        loop = TraceLoop(label, loop.operations[:-1], loop.operations[-1])
+        loop.jump.setdescr(token)
         return loop
 
     def assert_edges(self, graph, edge_list, exceptions):
@@ -39,13 +39,17 @@
         adding None instead of a list of integers skips the test.
         This checks both if a dependency forward and backward exists.
         """
-        assert len(edge_list) == len(graph.nodes)
+        assert len(edge_list) == len(graph.nodes) + 2
+        edge_list = edge_list[1:-1]
         for idx,edges in enumerate(edge_list):
             if edges is None:
                 continue
             node_a = graph.getnode(idx)
             dependencies = node_a.provides()[:]
             for idx_b in edges:
+                if idx_b == 0 or idx_b >= len(graph.nodes) + 2 -1:
+                    continue
+                idx_b -= 1
                 node_b = graph.getnode(idx_b)
                 dependency = node_a.getedge_to(node_b)
                 if dependency is None and idx_b not in exceptions.setdefault(idx,[]):
@@ -87,11 +91,17 @@
         return graph
 
     def assert_independent(self, a, b):
+        # XXX
+        a -= 1
+        b -= 1
         a = self.last_graph.getnode(a)
         b = self.last_graph.getnode(b)
         assert a.independent(b), "{a} and {b} are dependent!".format(a=a,b=b)
 
     def assert_dependent(self, a, b):
+        # XXX
+        a -= 1
+        b -= 1
         a = self.last_graph.getnode(a)
         b = self.last_graph.getnode(b)
         assert not a.independent(b), "{a} and {b} are independent!".format(a=a,b=b)
@@ -204,9 +214,6 @@
         jump() # 4:
         """
         graph = self.assert_dependencies(ops, full_check=True)
-        self.assert_independent(0,1)
-        self.assert_independent(0,2)
-        self.assert_independent(0,3)
         self.assert_dependent(1,2)
         self.assert_dependent(2,3)
         self.assert_dependent(1,3)
@@ -231,9 +238,9 @@
 
     def test_dependency_guard_2(self):
         ops = """
-        [i1] # 0: 1,2?,3?
+        [i1] # 0: 1,2?,3
         i2 = int_le(i1, 10) # 1: 2
-        guard_true(i2) [i1] # 2: 3
+        guard_true(i2) [i1] # 2:
         i3 = int_add(i1,1) # 3: 4
         jump(i3) # 4:
         """
@@ -243,7 +250,7 @@
         ops = """
         [i1] # 0: 1,2?,3
         i2 = int_lt(i1,10) # 1: 2
-        guard_false(i2) [i1] # 2: 3
+        guard_false(i2) [i1] # 2:
         i3 = int_add(i1,i1) # 3: 4
         jump(i3) # 4:
         """
@@ -257,15 +264,12 @@
         jump(i1) # 3:
         """
         self.assert_dependencies(ops, full_check=True)
-        self.assert_dependent(0,1)
-        self.assert_dependent(0,2)
-        self.assert_dependent(0,3)
 
     def test_dependencies_1(self):
         ops="""
         [i0, i1, i2] # 0: 1,3,6,7,11?
         i4 = int_gt(i1, 0) # 1: 2
-        guard_true(i4) [] # 2: 3, 5, 11?
+        guard_true(i4) [] # 2: 5, 11?
         i6 = int_sub(i1, 1) # 3: 4
         i8 = int_gt(i6, 0) # 4: 5
         guard_false(i8) [] # 5: 10
@@ -279,7 +283,6 @@
         self.assert_dependencies(ops, full_check=True)
         self.assert_independent(6, 2)
         self.assert_independent(6, 1)
-        self.assert_dependent(6, 0)
 
     def test_prevent_double_arg(self):
         ops="""
@@ -313,7 +316,7 @@
         [p0, p1, i2] # 0: 1,2?,3?,4?,5?
         i3 = int_add(i2,1) # 1: 2
         i4 = call_i(p0, i3, descr=nonwritedescr) # 2: 3,4,5?
-        guard_no_exception() [i2] # 3: 4?,5?
+        guard_no_exception() [i2] # 3:
         p2 = getarrayitem_gc_r(p1, i3, descr=arraydescr) # 4: 5
         jump(p2, p1, i3) # 5:
         """
@@ -419,7 +422,6 @@
         self.assert_dependencies(trace, full_check=True)
 
     def test_cyclic(self):
-        pass 
         trace = """
         [p0, p1, p5, p6, p7, p9, p11, p12] # 0: 1,6
         guard_early_exit() [] # 1: 2,4,6,7
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_schedule.py b/rpython/jit/metainterp/optimizeopt/test/test_schedule.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_schedule.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_schedule.py
@@ -1,15 +1,16 @@
 import py
 
 from rpython.jit.metainterp.history import TargetToken, JitCellToken, TreeLoop
-from rpython.jit.metainterp.optimizeopt.util import equaloplists, Renamer
-from rpython.jit.metainterp.optimizeopt.vectorize import (VecScheduleData,
+from rpython.jit.metainterp.optimizeopt.util import equaloplists
+from rpython.jit.metainterp.optimizeopt.renamer import Renamer
+from rpython.jit.metainterp.optimizeopt.vec import (VecScheduleData,
         Pack, Pair, NotAProfitableLoop, VectorizingOptimizer, X86_CostModel,
         PackSet)
 from rpython.jit.metainterp.optimizeopt.dependency import Node, DependencyGraph
 from rpython.jit.metainterp.optimizeopt.schedule import PackType
 from rpython.jit.metainterp.optimizeopt.test.test_util import LLtypeMixin
 from rpython.jit.metainterp.optimizeopt.test.test_dependency import DependencyBaseTest
-from rpython.jit.metainterp.optimizeopt.test.test_vectorize import (FakeMetaInterpStaticData,
+from rpython.jit.metainterp.optimizeopt.test.test_vecopt import (FakeMetaInterpStaticData,
         FakeJitDriverStaticData)
 from rpython.jit.metainterp.resoperation import rop, ResOperation
 from rpython.jit.tool.oparser import parse as opparse
@@ -42,8 +43,8 @@
     def namespace(self):
         return {
             'double': self.floatarraydescr,
-            'float': self.singlefloatarraydescr,
-            'long': self.intarraydescr,
+            'float': self.float32arraydescr,
+            'long': self.arraydescr,
             'int': self.int32arraydescr,
             'short': self.int16arraydescr,
             'char': self.chararraydescr,
@@ -77,8 +78,7 @@
         loop = opparse(src, cpu=self.cpu, namespace=self.namespace())
         if inc_label_jump:
             token = JitCellToken()
-            label = ResOperation(rop.LABEL, loop.inputargs,
-                                 None, descr=TargetToken(token))
+            label = ResOperation(rop.LABEL, loop.inputargs, 
descr=TargetToken(token))
             loop.operations = [label] + loop.operations
             loop.graph = FakeDependencyGraph(loop)
             return loop
@@ -138,19 +138,19 @@
 class Test(SchedulerBaseTest, LLtypeMixin):
     def test_schedule_split_load(self):
         loop1 = self.parse("""
-        i10 = raw_load(p0, i0, descr=float)
-        i11 = raw_load(p0, i1, descr=float)
-        i12 = raw_load(p0, i2, descr=float)
-        i13 = raw_load(p0, i3, descr=float)
-        i14 = raw_load(p0, i4, descr=float)
-        i15 = raw_load(p0, i5, descr=float)
+        f10 = raw_load_f(p0, i0, descr=float)
+        f11 = raw_load_f(p0, i1, descr=float)
+        f12 = raw_load_f(p0, i2, descr=float)
+        f13 = raw_load_f(p0, i3, descr=float)
+        f14 = raw_load_f(p0, i4, descr=float)
+        f15 = raw_load_f(p0, i5, descr=float)
         """)
         pack1 = self.pack(loop1, 0, 6, None, F32)
         loop2 = self.schedule(loop1, [pack1])
         loop3 = self.parse("""
         v10[i32|4] = vec_raw_load(p0, i0, 4, descr=float)
-        f10 = raw_load(p0, i4, descr=float)
-        f11 = raw_load(p0, i5, descr=float)
+        f10 = raw_load_f(p0, i4, descr=float)
+        f11 = raw_load_f(p0, i5, descr=float)
         """, False)
         self.assert_equal(loop2, loop3)
 
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_util.py 
b/rpython/jit/metainterp/optimizeopt/test/test_util.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_util.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_util.py
@@ -199,7 +199,10 @@
     immut_ptrval = cpu.fielddescrof(PTROBJ_IMMUT, 'ptrval')
 
     arraydescr = cpu.arraydescrof(lltype.GcArray(lltype.Signed))
+    int32arraydescr = cpu.arraydescrof(lltype.GcArray(rffi.INT))
+    int16arraydescr = cpu.arraydescrof(lltype.GcArray(rffi.SHORT))
     floatarraydescr = cpu.arraydescrof(lltype.GcArray(lltype.Float))
+    float32arraydescr = cpu.arraydescrof(lltype.GcArray(lltype.SingleFloat))
     arraydescr_tid = arraydescr.get_type_id()
     array = lltype.malloc(lltype.GcArray(lltype.Signed), 15, zero=True)
     arrayref = lltype.cast_opaque_ptr(llmemory.GCREF, array)
@@ -207,7 +210,6 @@
     array2ref = lltype.cast_opaque_ptr(llmemory.GCREF, array2)
     gcarraydescr = cpu.arraydescrof(lltype.GcArray(llmemory.GCREF))
     gcarraydescr_tid = gcarraydescr.get_type_id()
-    floatarraydescr = cpu.arraydescrof(lltype.GcArray(lltype.Float))
 
     # a GcStruct not inheriting from OBJECT
     tpl = lltype.malloc(S, zero=True)
@@ -403,6 +405,15 @@
     failargs_limit = 1000
     storedebug = None
 
+class FakeWarmState(object):
+    """Minimal warm-state stand-in for optimizer tests.
+
+    Carries the class-level `vec`, `vec_all` and `vec_cost` settings and
+    the `enable_opts` value handed to the constructor.
+    """
+    vec = True # default is on
+    vec_all = False
+    vec_cost = 0
+    def __init__(self, enable_opts):
+        # stored unchanged; this class does not interpret it
+        self.enable_opts = enable_opts
+
+class FakeJitDriverStaticData(object):
+    """Stub for the jit driver static data; only exposes `vec` (off)."""
+    vec = False
 
 class FakeMetaInterpStaticData(object):
 
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vecopt.py 
b/rpython/jit/metainterp/optimizeopt/test/test_vecopt.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/metainterp/optimizeopt/test/test_vecopt.py
@@ -0,0 +1,1474 @@
+import py
+import pytest
+
+from rpython.rlib.objectmodel import instantiate
+from rpython.jit.metainterp.optimizeopt.test.test_util import (LLtypeMixin,
+        FakeMetaInterpStaticData, convert_old_style_to_targets,
+        FakeWarmState)
+from rpython.jit.metainterp.optimizeopt.test.test_dependency import 
DependencyBaseTest
+from rpython.jit.metainterp.history import TargetToken, JitCellToken, TreeLoop
+from rpython.jit.metainterp.optimizeopt import optimize_trace
+import rpython.jit.metainterp.optimizeopt.optimizer as optimizeopt
+import rpython.jit.metainterp.optimizeopt.virtualize as virtualize
+from rpython.jit.metainterp.optimizeopt.dependency import DependencyGraph
+from rpython.jit.metainterp.optimizeopt.vectorize import 
(VectorizingOptimizer, MemoryRef,
+        isomorphic, Pair, NotAVectorizeableLoop, NotAProfitableLoop, 
GuardStrengthenOpt,
+        CostModel)
+from rpython.jit.metainterp.optimize import InvalidLoop
+from rpython.jit.metainterp import compile
+from rpython.jit.metainterp.resoperation import rop, ResOperation
+
+class FakeJitDriverStaticData(object):
+    """Stub for the jit driver static data; only exposes `vec` (on)."""
+    vec=True
+
+class FakeCostModel(CostModel):
+    def __init__(self):
+        CostModel.__init__(self, 0, 16)
+    def record_cast_int(self): pass
+    def record_pack_savings(self, pack, times): pass
+    def record_vector_pack(self, box, index, count): pass
+    def record_vector_unpack(self, box, index, count): pass
+    def unpack_cost(self, op, index, count): pass
+    def savings_for_pack(self, pack, times): pass
+    def profitable(self):
+        return True
+
+ARCH_VEC_REG_SIZE = 16
+
+class VecTestHelper(DependencyBaseTest):
+
+    enable_opts = "intbounds:rewrite:virtualize:string:earlyforce:pure:heap"
+
+    jitdriver_sd = FakeJitDriverStaticData()
+
+    def parse_loop(self, ops, add_label=True):
+        loop = self.parse(ops, postprocess=self.postprocess)
+        token = JitCellToken()
+        pre = []
+        tt = TargetToken(token)
+        if add_label:
+            pre = [ResOperation(rop.LABEL, loop.inputargs, None, descr=tt)]
+        else:
+            for i,op in enumerate(loop.operations):
+                if op.getopnum() == rop.LABEL:
+                    op.setdescr(tt)
+        loop.operations = pre + filter(lambda op: op.getopnum() != 
rop.DEBUG_MERGE_POINT, loop.operations)
+        if loop.operations[-1].getopnum() == rop.JUMP:
+            loop.operations[-1].setdescr(token)
+        for op in loop.operations:
+            if op.getopnum() == rop.GUARD_EARLY_EXIT and op.getdescr() is None:
+                op.setdescr(compile.ResumeAtLoopHeaderDescr())
+        return loop
+
+    def assert_vectorize(self, loop, expected_loop, call_pure_results=None):
+        self._do_optimize_loop(loop, call_pure_results, export_state=True)
+        self.assert_equal(loop, expected_loop)
+
+    def vectoroptimizer(self, loop):
+        metainterp_sd = FakeMetaInterpStaticData(self.cpu)
+        jitdriver_sd = FakeJitDriverStaticData()
+        opt = VectorizingOptimizer(metainterp_sd, jitdriver_sd, loop, 0)
+        label_index = loop.find_first_index(rop.LABEL)
+        opt.orig_label_args = loop.operations[label_index].getarglist()[:]
+        return opt
+
+    def vectoroptimizer_unrolled(self, loop, unroll_factor = -1):
+        loop.snapshot()
+        opt = self.vectoroptimizer(loop)
+        opt.linear_find_smallest_type(loop)
+        if unroll_factor == -1 and opt.smallest_type_bytes == 0:
+            raise NotAVectorizeableLoop()
+        if unroll_factor == -1:
+            unroll_factor = opt.get_unroll_count(ARCH_VEC_REG_SIZE)
+            print ""
+            print "unroll factor: ", unroll_factor, opt.smallest_type_bytes
+        if opt.loop.find_first_index(rop.GUARD_EARLY_EXIT) == -1:
+            idx = loop.find_first_index(rop.LABEL)
+            guard = ResOperation(rop.GUARD_EARLY_EXIT, [], None)
+            guard.setfailargs([])
+            guard.setdescr(compile.ResumeAtLoopHeaderDescr())
+            loop.operations.insert(idx+1, guard)
+        self.show_dot_graph(DependencyGraph(opt.loop), "original_" + 
self.test_name)
+        opt.analyse_index_calculations()
+        if opt.dependency_graph is not None:
+            cycle = opt.dependency_graph.cycles()
+            if cycle is not None:
+                print "CYCLE found %s" % cycle
+            self.show_dot_graph(opt.dependency_graph, "early_exit_" + 
self.test_name)
+            assert cycle is None
+            opt.schedule(False)
+        opt.unroll_loop_iterations(loop, unroll_factor)
+        opt.loop.operations = opt.get_newoperations()
+        self.debug_print_operations(opt.loop)
+        opt.clear_newoperations()
+        opt.dependency_graph = DependencyGraph(loop)
+        self.last_graph = opt.dependency_graph
+        self.show_dot_graph(self.last_graph, self.test_name)
+        return opt
+
+    def init_packset(self, loop, unroll_factor = -1):
+        opt = self.vectoroptimizer_unrolled(loop, unroll_factor)
+        opt.find_adjacent_memory_refs()
+        return opt
+
+    def extend_packset(self, loop, unroll_factor = -1):
+        opt = self.vectoroptimizer_unrolled(loop, unroll_factor)
+        opt.find_adjacent_memory_refs()
+        opt.extend_packset()
+        return opt
+
+    def combine_packset(self, loop, unroll_factor = -1):
+        opt = self.vectoroptimizer_unrolled(loop, unroll_factor)
+        opt.find_adjacent_memory_refs()
+        opt.extend_packset()
+        opt.combine_packset()
+        return opt
+
+    def schedule(self, loop, unroll_factor = -1, with_guard_opt=False):
+        opt = self.vectoroptimizer_unrolled(loop, unroll_factor)
+        opt.costmodel = FakeCostModel()
+        opt.find_adjacent_memory_refs()
+        opt.extend_packset()
+        opt.combine_packset()
+        opt.schedule(True)
+        if with_guard_opt:
+            gso = GuardStrengthenOpt(opt.dependency_graph.index_vars, 
opt.has_two_labels)
+            gso.propagate_all_forward(opt.loop)
+        return opt
+
+    def vectorize(self, loop, unroll_factor = -1):
+        opt = self.vectoroptimizer_unrolled(loop, unroll_factor)
+        opt.find_adjacent_memory_refs()
+        opt.extend_packset()
+        opt.combine_packset()
+        opt.costmodel.reset_savings()
+        opt.schedule(True)
+        if not opt.costmodel.profitable():
+            raise NotAProfitableLoop()
+        gso = GuardStrengthenOpt(opt.dependency_graph.index_vars, 
opt.has_two_labels)
+        gso.propagate_all_forward(opt.loop)
+        return opt
+
+    def assert_unroll_loop_equals(self, loop, expected_loop, \
+                     unroll_factor = -1):
+        vectoroptimizer = self.vectoroptimizer_unrolled(loop, unroll_factor)
+        self.assert_equal(loop, expected_loop)
+
+    def assert_pack(self, pack, indices):
+        assert len(pack.operations) == len(indices)
+        for op,i in zip(pack.operations, indices):
+            assert op.opidx == i
+
+    def assert_has_pack_with(self, packset, opindices):
+        for pack in packset.packs:
+            for op,i in zip(pack.operations, opindices):
+                if op.opidx != i:
+                    break
+            else:
+                # found a pack that points to the specified operations
+                break
+        else:
+            pytest.fail("could not find a packset that points to %s" % 
str(opindices))
+
+    def assert_packset_empty(self, packset, instr_count, exceptions):
+        for a,b in exceptions:
+            self.assert_packset_contains_pair(packset, a, b)
+        import itertools
+        combintations = set(itertools.product(range(instr_count),
+                                              range(instr_count)))
+        combintations -= set(exceptions)
+        for a,b in combintations:
+            self.assert_packset_not_contains_pair(packset, a, b)
+
+    def assert_packset_not_contains_pair(self, packset, x, y):
+        for pack in packset.packs:
+            if pack.left.opidx == x and \
+               pack.right.opidx == y:
+                pytest.fail("must not find packset with indices {x},{y}" \
+                                .format(x=x,y=y))
+
+    def assert_packset_contains_pair(self, packset, x, y):
+        for pack in packset.packs:
+            if isinstance(pack, Pair):
+                if pack.left.opidx == x and \
+                   pack.right.opidx == y:
+                    break
+        else:
+            pytest.fail("can't find a pack set for indices {x},{y}" \
+                            .format(x=x,y=y))
+    def assert_has_memory_ref_at(self, idx):
+        node = self.last_graph.nodes[idx]
+        assert node in self.last_graph.memory_refs, \
+            "operation %s at pos %d has no memory ref!" % \
+                (node.getoperation(), node.getindex())
+
+class BaseTestVectorize(VecTestHelper):
+
+    def test_vectorize_skip_impossible_1(self):
+        """ The trace contains no raw load / raw store on an array, so the
+        optimized loop is expected to equal the unmodified input. """
+        ops = """
+        [p0,i0]
+        i1 = int_add(i0,1)
+        i2 = int_le(i1, 10)
+        guard_true(i2) []
+        jump(p0,i1)
+        """
+        self.assert_vectorize(self.parse_loop(ops), self.parse_loop(ops))
+
+    def test_unroll_empty_stays_empty(self):
+        """ The trace holds nothing but the jump, so it is expected to be
+        unchanged after unrolling with unroll_factor=2. """
+        ops = """
+        []
+        jump()
+        """
+        self.assert_unroll_loop_equals(self.parse_loop(ops), 
self.parse_loop(ops), 2)
+
+    def test_vectorize_empty_with_early_exit(self):
+        """ Scheduling an empty loop is expected to raise
+        NotAVectorizeableLoop; reaching the line after schedule() fails
+        the test. """
+        ops = """
+        []
+        jump()
+        """
+        try:
+            self.schedule(self.parse_loop(ops),1)
+            py.test.fail("empty loop with no memory references is not 
vectorizable")
+        except NotAVectorizeableLoop:
+            pass
+
+    def test_unroll_empty_stays_empty_parameter(self):
+        """ Same as test_unroll_empty_stays_empty, but the loop carries one
+        input argument that is passed straight to the jump. """
+        ops = """
+        [i0]
+        jump(i0)
+        """
+        self.assert_unroll_loop_equals(self.parse_loop(ops), 
self.parse_loop(ops), 2)
+
+    def test_vect_pointer_fails(self):
+        """ Pointer arrays are currently rejected: a trace with a
+        raw_load_r is expected to come back unchanged. """
+        ops = """
+        [p0,i0]
+        raw_load_r(p0,i0,descr=arraydescr2)
+        jump(p0,i0)
+        """
+        self.assert_vectorize(self.parse_loop(ops), self.parse_loop(ops))
+
+    def test_load_primitive_python_list(self):
+        """ Two getarrayitem_gc reads at i0 and i0+1 of a float array are
+        expected to be merged into one vec_getarrayitem_gc of 2 items. """
+        ops = """
+        [p0,i0]
+        i2 = getarrayitem_gc(p0,i0,descr=floatarraydescr)
+        i1 = int_add(i0,1)
+        i3 = getarrayitem_gc(p0,i1,descr=floatarraydescr)
+        i4 = int_add(i1,1)
+        jump(p0,i4)
+        """
+        opt = """
+        [p0,i0]
+        i1 = int_add(i0,1)
+        i2 = int_add(i0,2)
+        i3 = vec_getarrayitem_gc(p0,i0,2,descr=floatarraydescr)
+        jump(p0,i2)
+        """
+        # unroll_factor=0: the trace already contains both accesses
+        vopt = self.vectorize(self.parse_loop(ops),0)
+        self.assert_equal(vopt.loop, self.parse_loop(opt))
+
+    def test_vect_unroll_char(self):
+        """ A 16 byte vector register holds 16 chars, so with the char
+        array being the smallest type in the trace the load is expected
+        to appear 16 times after unrolling. """
+        ops = """
+        [p0,i0]
+        raw_load_i(p0,i0,descr=chararraydescr)
+        jump(p0,i0)
+        """
+        # the expected trace is the same load repeated 16 times
+        opt_ops = """
+        [p0,i0]
+        {}
+        jump(p0,i0)
+        """.format(('\n' + ' ' 
*8).join(['raw_load_i(p0,i0,descr=chararraydescr)'] * 16))
+        self.assert_unroll_loop_equals(self.parse_loop(ops), 
self.parse_loop(opt_ops))
+
+    def test_unroll_vector_addition(self):
+        """ A larger trace doing vector addition (smallest type is the
+        8 byte float).  Unrolled with unroll_factor=1, the expected trace
+        contains the loop body twice, each copy starting with its index
+        increment and guard. """
+        ops = """
+        [p0,p1,p2,i0]
+        i1 = raw_load_i(p1, i0, descr=floatarraydescr)
+        i2 = raw_load_i(p2, i0, descr=floatarraydescr)
+        i3 = int_add(i1,i2)
+        raw_store(p0, i0, i3, descr=floatarraydescr)
+        i4 = int_add(i0, 1)
+        i5 = int_le(i4, 10)
+        guard_true(i5) []
+        jump(p0,p1,p2,i4)
+        """
+        opt_ops = """
+        [p0,p1,p2,i0]
+        i4 = int_add(i0, 1)
+        i5 = int_le(i4, 10)
+        guard_true(i5) []
+        i1 = raw_load_i(p1, i0, descr=floatarraydescr)
+        i2 = raw_load_i(p2, i0, descr=floatarraydescr)
+        i3 = int_add(i1,i2)
+        raw_store(p0, i0, i3, descr=floatarraydescr)
+        i9 = int_add(i4, 1)
+        i10 = int_le(i9, 10)
+        guard_true(i10) []
+        i6 = raw_load_i(p1, i4, descr=floatarraydescr)
+        i7 = raw_load_i(p2, i4, descr=floatarraydescr)
+        i8 = int_add(i6,i7)
+        raw_store(p0, i4, i8, descr=floatarraydescr)
+        jump(p0,p1,p2,i9)
+        """
+        self.assert_unroll_loop_equals(self.parse_loop(ops), 
self.parse_loop(opt_ops), 1)
+
+    def test_estimate_unroll_factor_smallest_byte_zero(self):
+        """ Directly after construction the optimizer is expected to report
+        smallest_type_bytes == 0 and an unroll count of 0 for
+        ARCH_VEC_REG_SIZE. """
+        ops = """
+        [p0,i0]
+        raw_load_i(p0,i0,descr=arraydescr)
+        jump(p0,i0)
+        """
+        vopt = self.vectoroptimizer(self.parse_loop(ops))
+        assert 0 == vopt.smallest_type_bytes
+        assert 0 == vopt.get_unroll_count(ARCH_VEC_REG_SIZE)
+
+    def test_array_operation_indices_not_unrolled(self):
+        """ With unroll_factor=0 the single raw load is expected to yield
+        exactly one memory reference, located at node 1. """
+        ops = """
+        [p0,i0]
+        raw_load_i(p0,i0,descr=arraydescr)
+        jump(p0,i0)
+        """
+        vopt = self.vectoroptimizer_unrolled(self.parse_loop(ops),0)
+        assert len(vopt.dependency_graph.memory_refs) == 1
+        self.assert_has_memory_ref_at(1)
+
+    def test_array_operation_indices_unrolled_1(self):
+        """ With unroll_factor=1 the single raw load is expected to yield
+        two memory references, at nodes 1 and 2. """
+        ops = """
+        [p0,i0]
+        raw_load_i(p0,i0,descr=chararraydescr)
+        jump(p0,i0)
+        """
+        vopt = self.vectoroptimizer_unrolled(self.parse_loop(ops),1)
+        assert len(vopt.dependency_graph.memory_refs) == 2
+        self.assert_has_memory_ref_at(1)
+        self.assert_has_memory_ref_at(2)
+
+    def test_array_operation_indices_unrolled_2(self):
+        ops = """
+        [p0,i0,i1]
+        i3 = raw_load_i(p0,i0,descr=chararraydescr)
+        i4 = raw_load_i(p0,i1,descr=chararraydescr)
+        jump(p0,i3,i4)
+        """
+        loop = self.parse_loop(ops)
+        vopt = self.vectoroptimizer_unrolled(loop,0)
+        assert len(vopt.dependency_graph.memory_refs) == 2
+        self.assert_has_memory_ref_at(1)
+        self.assert_has_memory_ref_at(2)
+        #
+        vopt = self.vectoroptimizer_unrolled(self.parse_loop(ops),1)
+        assert len(vopt.dependency_graph.memory_refs) == 4
+        for i in [1,2,3,4]:
+            self.assert_has_memory_ref_at(i)
+        #
+        vopt = self.vectoroptimizer_unrolled(self.parse_loop(ops),3)
+        assert len(vopt.dependency_graph.memory_refs) == 8
+        for i in [1,2,3,4,5,6,7,8]:
+            self.assert_has_memory_ref_at(i)
+
+    def test_array_memory_ref_adjacent_1(self):
+        """ A char load at i0 with i0 advanced by 1 per iteration: after
+        unrolling once, the first and second RAW_LOAD are expected to be
+        adjacent to each other (in both directions). """
+        ops = """
+        [p0,i0]
+        i3 = raw_load_i(p0,i0,descr=chararraydescr)
+        i1 = int_add(i0,1)
+        jump(p0,i1)
+        """
+        loop = self.parse_loop(ops)
+        vopt = self.vectoroptimizer_unrolled(loop,1)
+        vopt.find_adjacent_memory_refs()
+        assert len(vopt.dependency_graph.memory_refs) == 2
+
+        mref1 = self.getmemref(loop.find_first_index(rop.RAW_LOAD))
+        mref3 = self.getmemref(loop.find_first_index(rop.RAW_LOAD,1))
+        assert isinstance(mref1, MemoryRef)
+        assert isinstance(mref3, MemoryRef)
+
+        assert mref1.is_adjacent_to(mref3)
+        assert mref3.is_adjacent_to(mref1)
+
+    def test_array_memory_ref_1(self):
+        """ Loading at the plain index i0 is expected to give an index
+        variable with coefficient_mul 1 and constant 0. """
+        ops = """
+        [p0,i0]
+        i3 = raw_load_i(p0,i0,descr=chararraydescr)
+        jump(p0,i0)
+        """
+        vopt = self.vectoroptimizer_unrolled(self.parse_loop(ops),0)
+        vopt.find_adjacent_memory_refs()
+        mref1 = self.getmemref(1)
+        assert isinstance(mref1, MemoryRef)
+        assert mref1.index_var.coefficient_mul == 1
+        assert mref1.index_var.constant == 0
+
+    def test_array_memory_ref_2(self):
+        """ Loading at i0+1 is expected to give an index variable with
+        coefficient_mul 1 and constant 1. """
+        ops = """
+        [p0,i0]
+        i1 = int_add(i0,1)
+        i3 = raw_load_i(p0,i1,descr=chararraydescr)
+        jump(p0,i1)
+        """
+        vopt = self.vectoroptimizer_unrolled(self.parse_loop(ops),0)
+        vopt.find_adjacent_memory_refs()
+        mref1 = self.getmemref(2)
+        assert isinstance(mref1, MemoryRef)
+        assert mref1.index_var.coefficient_mul == 1
+        assert mref1.index_var.constant == 1
+
+    def test_array_memory_ref_sub_index(self):
+        """ Loading at i0-1 is expected to give an index variable with
+        coefficient_mul 1 and constant -1. """
+        ops = """
+        [p0,i0]
+        i1 = int_sub(i0,1)
+        i3 = raw_load_i(p0,i1,descr=chararraydescr)
+        jump(p0,i1)
+        """
+        vopt = self.vectoroptimizer_unrolled(self.parse_loop(ops),0)
+        vopt.find_adjacent_memory_refs()
+        mref1 = self.getmemref(2)
+        assert isinstance(mref1, MemoryRef)
+        assert mref1.index_var.coefficient_mul == 1
+        assert mref1.index_var.constant == -1
+
+    def test_array_memory_ref_add_mul_index(self):
+        """ Loading at (i0+1)*3 is expected to give an index variable with
+        coefficient_mul 3 and constant 3. """
+        ops = """
+        [p0,i0]
+        i1 = int_add(i0,1)
+        i2 = int_mul(i1,3)
+        i3 = raw_load_i(p0,i2,descr=chararraydescr)
+        jump(p0,i1)
+        """
+        vopt = self.vectoroptimizer_unrolled(self.parse_loop(ops),0)
+        vopt.find_adjacent_memory_refs()
+        mref1 = self.getmemref(3)
+        assert isinstance(mref1, MemoryRef)
+        assert mref1.index_var.coefficient_mul == 3
+        assert mref1.index_var.constant == 3
+
+    def test_array_memory_ref_add_mul_index_interleaved(self):
+        """ Alternating int_add / int_mul chains are expected to fold into
+        a single multiplier and constant on the index variable. """
+        # ((i0+1)*3+5)*6  ->  18*i0 + 48
+        ops = """
+        [p0,i0]
+        i1 = int_add(i0,1)
+        i2 = int_mul(i1,3)
+        i3 = int_add(i2,5)
+        i4 = int_mul(i3,6)
+        i5 = raw_load_i(p0,i4,descr=chararraydescr)
+        jump(p0,i4)
+        """
+        vopt = self.vectoroptimizer_unrolled(self.parse_loop(ops),0)
+        vopt.find_adjacent_memory_refs()
+        mref1 = self.getmemref(5)
+        assert isinstance(mref1, MemoryRef)
+        assert mref1.index_var.coefficient_mul == 18
+        assert mref1.index_var.constant == 48
+
+        # the same chain extended by +30 and *57  ->  multiplier 3*6*57
+        ops = """
+        [p0,i0]
+        i1 = int_add(i0,1)
+        i2 = int_mul(i1,3)
+        i3 = int_add(i2,5)
+        i4 = int_mul(i3,6)
+        i5 = int_add(i4,30)
+        i6 = int_mul(i5,57)
+        i7 = raw_load_i(p0,i6,descr=chararraydescr)
+        jump(p0,i6)
+        """
+        vopt = self.vectoroptimizer_unrolled(self.parse_loop(ops),0)
+        vopt.find_adjacent_memory_refs()
+        mref1 = self.getmemref(7)
+        assert isinstance(mref1, MemoryRef)
+        assert mref1.index_var.coefficient_mul == 1026
+        assert mref1.index_var.coefficient_div == 1
+        assert mref1.index_var.constant == 57*(30) + 57*6*(5) + 57*6*3*(1)
+
+    def test_array_memory_ref_sub_mul_index_interleaved(self):
+        """ ((i0+1)*3-3)*2 is expected to fold to multiplier 6, divisor 1
+        and constant 0 (the +3 and -3 cancel). """
+        ops = """
+        [p0,i0]
+        i1 = int_add(i0,1)
+        i2 = int_mul(i1,3)
+        i3 = int_sub(i2,3)
+        i4 = int_mul(i3,2)
+        i5 = raw_load_i(p0,i4,descr=chararraydescr)
+        jump(p0,i4)
+        """
+        vopt = self.vectoroptimizer_unrolled(self.parse_loop(ops),0)
+        vopt.find_adjacent_memory_refs()
+        mref1 = self.getmemref(5)
+        assert isinstance(mref1, MemoryRef)
+        assert mref1.index_var.coefficient_mul == 6
+        assert mref1.index_var.coefficient_div == 1
+        assert mref1.index_var.constant == 0
+
+    def test_array_memory_ref_not_adjacent_1(self):
+        ops = """
+        [p0,i0,i4]
+        i3 = raw_load_i(p0,i0,descr=chararraydescr)
+        i1 = int_add(i0,1)
+        i5 = raw_load_i(p0,i4,descr=chararraydescr)
+        i6 = int_add(i4,1)
+        jump(p0,i1,i6)
+        """
+        loop = self.parse_loop(ops)
+        vopt = self.vectoroptimizer_unrolled(loop,1)
+        vopt.find_adjacent_memory_refs()
+
+        f = lambda x: loop.find_first_index(rop.RAW_LOAD, x)
+        indices = [f(0),f(1),f(2),f(3)]
+        for i in indices:
+            self.assert_has_memory_ref_at(i)
+        assert len(vopt.dependency_graph.memory_refs) == 4
+
+        mref1, mref3, mref5, mref7 = [self.getmemref(i) for i in indices]
+        assert isinstance(mref1, MemoryRef)
+        assert isinstance(mref3, MemoryRef)
+        assert isinstance(mref5, MemoryRef)
+        assert isinstance(mref7, MemoryRef)
+
+        self.assert_memory_ref_adjacent(mref1, mref5)
+        self.assert_memory_ref_not_adjacent(mref1, mref3)
+        self.assert_memory_ref_not_adjacent(mref1, mref7)
+        self.assert_memory_ref_adjacent(mref3, mref7)
+        assert mref1.is_adjacent_after(mref5)
+
+    def test_array_memory_ref_div(self):
+        """ Floor divisions in the index calculation are expected to be
+        tracked in coefficient_div of the index variable. """
+        # (i0/2)/8  ->  divisor 16
+        ops = """
+        [p0,i0]
+        i1 = int_floordiv(i0,2)
+        i2 = int_floordiv(i1,8)
+        i3 = raw_load_i(p0,i2,descr=chararraydescr)
+        jump(p0,i2)
+        """
+        vopt = self.vectoroptimizer_unrolled(self.parse_loop(ops),0)
+        vopt.find_adjacent_memory_refs()
+        mref = self.getmemref(3)
+        assert mref.index_var.coefficient_div == 16
+        # (i0+8)/2 (unsigned)  ->  divisor 2, constant 4
+        ops = """
+        [p0,i0]
+        i1 = int_add(i0,8)
+        i2 = uint_floordiv(i1,2)
+        i3 = raw_load_i(p0,i2,descr=chararraydescr)
+        jump(p0,i2)
+        """
+        vopt = self.vectoroptimizer_unrolled(self.parse_loop(ops),0)
+        vopt.find_adjacent_memory_refs()
+        mref = self.getmemref(3)
+        assert mref.index_var.coefficient_div == 2
+        assert mref.index_var.constant == 4
+        # a divided index and a multiplied index must be neither adjacent
+        # nor equal
+        ops = """
+        [p0,i0]
+        i1 = int_add(i0,8)
+        i2 = int_floordiv(i1,2)
+        i3 = raw_load_i(p0,i2,descr=chararraydescr)
+        i4 = int_add(i0,4)
+        i5 = int_mul(i4,2)
+        i6 = raw_load_i(p0,i5,descr=chararraydescr)
+        jump(p0,i2)
+        """
+        vopt = self.vectoroptimizer_unrolled(self.parse_loop(ops),0)
+        vopt.find_adjacent_memory_refs()
+        mref = self.getmemref(5)
+        mref2 = self.getmemref(6)
+
+        self.assert_memory_ref_not_adjacent(mref, mref2)
+        assert mref != mref2
+
+    def test_array_memory_ref_diff_calc_but_equal(self):
+        """ (i0+4)*2 and (i0+2)*2+4 are computed differently but denote the
+        same index: the two memory refs are expected to compare equal and
+        not to be adjacent. """
+        ops = """
+        [p0,i0]
+        i1 = int_add(i0,4)
+        i2 = int_mul(i1,2)
+        i3 = raw_load_i(p0,i2,descr=chararraydescr)
+        i4 = int_add(i0,2)
+        i5 = int_mul(i4,2)
+        i6 = int_add(i5,4)
+        i7 = raw_load_i(p0,i6,descr=chararraydescr)
+        jump(p0,i2)
+        """
+        vopt = self.vectoroptimizer_unrolled(self.parse_loop(ops),0)
+        vopt.find_adjacent_memory_refs()
+        mref = self.getmemref(6)
+        mref2 = self.getmemref(7)
+
+        self.assert_memory_ref_not_adjacent(mref, mref2)
+        assert mref == mref2
+
+    def test_array_memory_ref_diff_not_equal(self):
+        """ (i0+4)/2 and (i0+2)*2+4 denote different indices: the two
+        memory refs are expected to be neither adjacent nor equal. """
+        ops = """
+        [p0,i0]
+        i1 = int_add(i0,4)
+        i2 = int_floordiv(i1,2)
+        i3 = raw_load_i(p0,i2,descr=chararraydescr)
+        i4 = int_add(i0,2)
+        i5 = int_mul(i4,2)
+        i6 = int_add(i5,4)
+        i7 = raw_load_i(p0,i6,descr=chararraydescr)
+        jump(p0,i2)
+        """
+        vopt = self.vectoroptimizer_unrolled(self.parse_loop(ops),0)
+        vopt.find_adjacent_memory_refs()
+        mref = self.getmemref(6)
+        mref2 = self.getmemref(7)
+
+        self.assert_memory_ref_not_adjacent(mref, mref2)
+        assert mref != mref2
+
+    def test_packset_init_simple(self):
+        """ A char array read followed by the index increment and guard:
+        after unrolling once, the pack set is expected to hold exactly one
+        pack built from the two memory references. """
+        ops = """
+        [p0,i0]
+        i3 = getarrayitem_raw(p0, i0, descr=chararraydescr)
+        i1 = int_add(i0, 1)
+        i2 = int_le(i1, 16)
+        guard_true(i2) [p0, i0]
+        jump(p0,i1)
+        """
+        loop = self.parse_loop(ops)
+        vopt = self.init_packset(loop,1)
+        self.assert_independent(4,8)
+        assert vopt.packset is not None
+        assert len(vopt.dependency_graph.memory_refs) == 2
+        assert len(vopt.packset.packs) == 1
+
+    def test_packset_init_raw_load_not_adjacent_and_adjacent(self):
+        """ Loads from a never-changing index give no packs; loads from an
+        index advanced by 1 per iteration are expected to give one pair
+        per neighbouring couple of loads (3 pairs for 4 loads). """
+        # index i0 is passed on unchanged: 4 memory refs, no pack
+        ops = """
+        [p0,i0]
+        i3 = raw_load_i(p0, i0, descr=chararraydescr)
+        jump(p0,i0)
+        """
+        loop = self.parse_loop(ops)
+        vopt = self.init_packset(loop,3)
+        assert len(vopt.dependency_graph.memory_refs) == 4
+        assert len(vopt.packset.packs) == 0
+        # index is incremented before every load: 4 memory refs, 3 pairs
+        ops = """
+        [p0,i0]
+        i2 = int_add(i0,1)
+        raw_load_i(p0, i2, descr=chararraydescr)
+        jump(p0,i2)
+        """
+        loop = self.parse_loop(ops)
+        vopt = self.init_packset(loop,3)
+        assert len(vopt.dependency_graph.memory_refs) == 4
+        assert len(vopt.packset.packs) == 3
+        for i in range(3):
+            # the paired operations sit 2 positions apart
+            x = (i+1)*2
+            y = x + 2
+            self.assert_independent(x,y)
+            self.assert_packset_contains_pair(vopt.packset, x,y)
+
+    def test_packset_init_2(self):
+        """ A char read behind the guard, unrolled with factor 15: 16
+        memory references and 15 pairs are expected, and only memory refs
+        whose positions differ by 4 may be adjacent. """
+        ops = """
+        [p0,i0]
+        i1 = int_add(i0, 1)
+        i2 = int_le(i1, 16)
+        guard_true(i2) [p0, i0]
+        i3 = getarrayitem_raw(p0, i1, descr=chararraydescr)
+        jump(p0,i1)
+        """
+        loop = self.parse_loop(ops)
+        vopt = self.init_packset(loop,15)
+        assert len(vopt.dependency_graph.memory_refs) == 16
+        assert len(vopt.packset.packs) == 15
+        # assure that memory refs are not adjacent for all
+        for i in range(15):
+            for j in range(15):
+                try:
+                    if i-4 == j or i+4 == j:
+                        mref1 = self.getmemref(i)
+                        mref2 = self.getmemref(j)
+                        assert mref1.is_adjacent_to(mref2)
+                    else:
+                        mref1 = self.getmemref(i)
+                        mref2 = self.getmemref(j)
+                        assert not mref1.is_adjacent_to(mref2)
+                except KeyError:
+                    # a KeyError here just skips this (i, j) combination
+                    pass
+        for i in range(15):
+            # the paired operations sit 4 positions apart
+            x = (i+1)*4
+            y = x + 4
+            self.assert_independent(x,y)
+            self.assert_packset_contains_pair(vopt.packset, x, y)
+
+    def test_isomorphic_operations(self):
+        """ Checks isomorphic() on a parsed trace: ops[1] and ops[4] must
+        be isomorphic, while ops[0] must be isomorphic to neither ops[1]
+        nor ops[5].  The stricter descr-based checks are still disabled. """
+        ops_src = """
+        [p1,p0,i0]
+        i3 = getarrayitem_raw(p0, i0, descr=chararraydescr)
+        i1 = int_add(i0, 1)
+        i2 = int_le(i1, 16)
+        i4 = getarrayitem_raw(p0, i1, descr=chararraydescr)
+        i5 = getarrayitem_raw(p1, i1, descr=floatarraydescr)
+        i6 = getarrayitem_raw(p0, i1, descr=floatarraydescr)
+        guard_true(i2) [p0, i0]
+        jump(p1,p0,i1)
+        """
+        loop = self.parse_loop(ops_src)
+        ops = loop.operations
+        assert isomorphic(ops[1], ops[4])
+        assert not isomorphic(ops[0], ops[1])
+        assert not isomorphic(ops[0], ops[5])
+        # TODO strong assumptions do hold here?
+        #assert not isomorphic(ops[4], ops[5])
+        #assert not isomorphic(ops[5], ops[6])
+        #assert not isomorphic(ops[4], ops[6])
+        #assert not isomorphic(ops[1], ops[6])
+
+    def test_packset_extend_simple(self):
+        """ A char read whose result feeds an int_add: after unrolling once
+        and extending, the pack set is expected to hold exactly the pairs
+        (4,9) and (5,10). """
+        ops = """
+        [p0,i0]
+        i1 = int_add(i0, 1)
+        i2 = int_le(i1, 16)
+        guard_true(i2) [p0, i0]
+        i3 = getarrayitem_raw(p0, i1, descr=chararraydescr)
+        i4 = int_add(i3, 1)
+        jump(p0,i1)
+        """
+        loop = self.parse_loop(ops)
+        vopt = self.extend_packset(loop,1)
+        assert len(vopt.dependency_graph.memory_refs) == 2
+        self.assert_independent(5,10)
+        assert len(vopt.packset.packs) == 2
+        self.assert_packset_empty(vopt.packset, len(loop.operations),
+                                  [(5,10), (4,9)])
+
+    def test_packset_extend_load_modify_store(self):
+        """ Load / multiply / store on a char array: after unrolling once
+        and extending, the pack set is expected to hold exactly the pairs
+        (4,10), (5,11) and (6,12). """
+        ops = """
+        [p0,i0]
+        guard_early_exit() []
+        i1 = int_add(i0, 1)
+        i2 = int_le(i1, 16)
+        guard_true(i2) [p0, i0]
+        i3 = getarrayitem_raw(p0, i1, descr=chararraydescr)
+        i4 = int_mul(i3, 2)
+        setarrayitem_raw(p0, i1, i4, descr=chararraydescr)
+        jump(p0,i1)
+        """
+        loop = self.parse_loop(ops)
+        vopt = self.extend_packset(loop,1)
+        assert len(vopt.dependency_graph.memory_refs) == 4
+        self.assert_independent(4,10)
+        self.assert_independent(5,11)
+        self.assert_independent(6,12)
+        assert len(vopt.packset.packs) == 3
+        self.assert_packset_empty(vopt.packset, len(loop.operations),
+                                  [(6,12), (5,11), (4,10)])
+
+    # NOTE(review): the descr names expand to `<descr>arraydescr`; this
+    # changeset introduces `float32arraydescr` and `arraydescr` in other
+    # files, so `singlefloatarraydescr` / `intarraydescr` may no longer
+    # exist -- confirm against test_util.py.
+    @pytest.mark.parametrize("descr,packs,packidx", 
+                             [('char',1,  [(0,(2,4,6,8))]),
+                              ('float',2, [(0,(2,4)),(1,(6,8))]),
+                              ('int',2,   [(0,(2,4)),(1,(6,8))]),
+                              ('singlefloat',1,[(0,(2,4,6,8))])])
+    def test_packset_combine_simple(self,descr,packs,packidx):
+        """ One array read per iteration, unrolled with factor 3: the four
+        reads are expected to be combined into `packs` packs, where
+        `packidx` lists (pack position, expected opidx tuple). """
+        ops = """
+        [p0,i0]
+        i3 = getarrayitem_raw(p0, i0, descr={descr}arraydescr)
+        i1 = int_add(i0,1)
+        jump(p0,i1)
+        """.format(descr=descr)
+        loop = self.parse_loop(ops)
+        vopt = self.combine_packset(loop,3)
+        assert len(vopt.dependency_graph.memory_refs) == 4
+        assert len(vopt.packset.packs) == packs
+        for i,t in packidx:
+            self.assert_pack(vopt.packset.packs[i], t)
+
+    @pytest.mark.parametrize("descr,stride,packs,suffix",
+            
[('char',1,1,'_i'),('float',8,4,'_f'),('int',8,4,'_i'),('float32',4,2,'_i')])
+    def test_packset_combine_2_loads_in_trace(self, descr, stride,packs):
+        ops = """
+        [p0,i0]
+        i3 = raw_load{suffix}(p0, i0, descr={type}arraydescr)
+        i1 = int_add(i0,{stride})
+        i4 = raw_load{suffix}(p0, i1, descr={type}arraydescr)
+        i2 = int_add(i1,{stride})
+        jump(p0,i2)
+        """.format(type=descr,stride=stride,suffix=suffix)
+        loop = self.parse_loop(ops)
+        vopt = self.combine_packset(loop,3)
+        assert len(vopt.dependency_graph.memory_refs) == 8
+        assert len(vopt.packset.packs) == packs
+
+    def test_packset_combine_no_candidates_packset_empty(self):
+        ops = """
+        []
+        jump()
+        """
+        try:
+            self.combine_packset(self.parse_loop(ops),15)
+            pytest.fail("combine should raise an exception if no pack "
+                        "statements are present")
+        except NotAVectorizeableLoop:
+            pass
+
+        ops = """
+        [p0,i0]
+        i3 = getarrayitem_raw(p0, i0, descr=floatarraydescr)
+        jump(p0,i3)
+        """
+        try:
+            loop = self.parse_loop(ops)
+            self.combine_packset(loop,15)
+        except NotAVectorizeableLoop:
+            pass
+
+    @pytest.mark.parametrize("op,descr,stride",
+            [('int_add','char',1),
+             ('int_sub','char',1),
+             ('int_mul','char',1),
+             ('float_add','float',8),
+             ('float_sub','float',8),
+             ('float_mul','float',8),
+             ('float_add','singlefloat',4),
+             ('float_sub','singlefloat',4),
+             ('float_mul','singlefloat',4),
+             ('int_add','int',8),
+             ('int_sub','int',8),
+             ('int_mul','int',8),
+            ])
+    def test_packset_vector_operation(self, op, descr, stride):
+        ops = """
+        [p0,p1,p2,i0]
+        guard_early_exit() []
+        i1 = int_add(i0, {stride})
+        i10 = int_le(i1, 128)
+        guard_true(i10) []
+        i2 = raw_load{suffix}(p0, i0, descr={descr}arraydescr)
+        i3 = raw_load{suffix}(p1, i0, descr={descr}arraydescr)
+        i4 = {op}(i2,i3)
+        raw_store(p2, i0, i4, descr={descr}arraydescr)
+        jump(p0,p1,p2,i1)
+        """.format(op=op,descr=descr,stride=stride)
+        loop = self.parse_loop(ops)
+        vopt = self.combine_packset(loop,3)
+        assert len(vopt.dependency_graph.memory_refs) == 12
+        if stride == 8:
+            assert len(vopt.packset.packs) == 8
+        else:
+            assert len(vopt.packset.packs) == 4
+
+        for opindices in [(5,12,19,26),(6,13,20,27),
+                          (7,14,21,28),(4,11,18,25)]:
+            self.assert_has_pack_with(vopt.packset, opindices)
+
+    @pytest.mark.parametrize('op,descr,stride',
+            [('float_add','float',8),
+             ('float_sub','float',8),
+             ('float_mul','float',8),
+             ('int_add','int',8),
+             ('int_sub','int',8),
+            ])
+    def test_schedule_vector_operation(self, op, descr, stride):
+        ops = """
+        [p0,p1,p2,i0] # 0
+        guard_early_exit() []
+        i10 = int_le(i0, 128)  # 1, 8, 15, 22
+        guard_true(i10) [p0,p1,p2,i0] # 2, 9, 16, 23
+        i2 = getarrayitem_raw(p0, i0, descr={descr}arraydescr) # 3, 10, 17, 24
+        i3 = getarrayitem_raw(p1, i0, descr={descr}arraydescr) # 4, 11, 18, 25
+        i4 = {op}(i2,i3) # 5, 12, 19, 26
+        setarrayitem_raw(p2, i0, i4, descr={descr}arraydescr) # 6, 13, 20, 27
+        i1 = int_add(i0, {stride}) # 7, 14, 21, 28
+        jump(p0,p1,p2,i1) # 29
+        """.format(op=op,descr=descr,stride=1) # stride getarray is always 1
+        vops = """
+        [p0,p1,p2,i0]
+        i10 = int_le(i0, 128)
+        guard_true(i10) []
+        i1 = int_add(i0, {stride})
+        i11 = int_le(i1, 128)
+        guard_true(i11) []
+        i12 = int_add(i1, {stride})
+        v1 = vec_getarrayitem_raw(p0, i0, 2, descr={descr}arraydescr)
+        v2 = vec_getarrayitem_raw(p1, i0, 2, descr={descr}arraydescr)
+        v3 = {op}(v1,v2)
+        vec_setarrayitem_raw(p2, i0, v3, descr={descr}arraydescr)
+        jump(p0,p1,p2,i12)
+        """.format(op='vec_'+op,descr=descr,stride=1)
+        loop = self.parse_loop(ops)
+        vopt = self.schedule(loop, 1)
+        self.assert_equal(loop, self.parse_loop(vops))
+
+    def test_vschedule_trace_1(self):
+        ops = """
+        [i0, i1, i2, i3, i4]
+        guard_early_exit() []
+        i6 = int_mul(i0, 8)
+        i7 = raw_load(i2, i6, descr=arraydescr)
+        i8 = raw_load(i3, i6, descr=arraydescr)
+        i9 = int_add(i7, i8)
+        raw_store(i4, i6, i9, descr=arraydescr)
+        i11 = int_add(i0, 1)
+        i12 = int_lt(i11, i1)
+        guard_true(i12) [i4, i3, i2, i1, i11]
+        jump(i11, i1, i2, i3, i4)
+        """
+        opt="""
+        [i0, i1, i2, i3, i4]
+        i11 = int_add(i0, 1) 
+        i6 = int_mul(i0, 8) 
+        i12 = int_lt(i11, i1) 
+        guard_true(i12) []
+        i13 = int_add(i11, 1) 
+        i14 = int_mul(i11, 8) 
+        i18 = int_lt(i13, i1) 
+        guard_true(i18) []
+        v19 = vec_raw_load(i2, i6, 2, descr=arraydescr) 
+        v20 = vec_raw_load(i3, i6, 2, descr=arraydescr) 
+        v21 = vec_int_add(v19, v20) 
+        vec_raw_store(i4, i6, v21, descr=arraydescr) 
+        jump(i13, i1, i2, i3, i4)
+        """
+        vopt = self.schedule(self.parse_loop(ops),1)
+        self.assert_equal(vopt.loop, self.parse_loop(opt))
+
+    def test_collapse_index_guard_1(self):
+        ops = """
+        [p0,i0]
+        guard_early_exit() [p0,i0]
+        i1 = getarrayitem_raw(p0, i0, descr=chararraydescr)
+        i2 = int_add(i0, 1)
+        i3 = int_lt(i2, 102)
+        guard_true(i3) [p0,i0]
+        jump(p0,i2)
+        """
+        dead_code =  '\n        '.join([
+          "i{t1} = int_add(i0,{i})\n        i{s} = int_lt(i{t1}, 102)".format(
+              i=i+2, t1=i+201, t=i+200, s=i+20)
+          for i in range(0,14)])
+        opt="""
+        [p0,i0]
+        i200 = int_add(i0, 1)
+        i400 = int_lt(i200, 102)
+        i2 = int_add(i0, 16)
+        i3 = int_lt(i2, 102)
+        guard_true(i3) [p0,i0]
+        {dead_code}
+        i500 = int_add(i0, 16)
+        i501 = int_lt(i2, 102)
+        i1 = vec_getarrayitem_raw(p0, i0, 16, descr=chararraydescr)
+        jump(p0,i2)
+        """.format(dead_code=dead_code)
+        vopt = self.schedule(self.parse_loop(ops),15,with_guard_opt=True)
+        self.assert_equal(vopt.loop, self.parse_loop(opt))
+
+    def test_too_small_vector(self):
+        ops = """
+        [p0,i0]
+        guard_early_exit() [p0,i0]
+        i1 = getarrayitem_raw(p0, 0, descr=chararraydescr) # constant index
+        i2 = getarrayitem_raw(p0, 1, descr=chararraydescr) # constant index
+        i4 = int_add(i1, i2)
+        i3 = int_add(i0,1)
+        i5 = int_lt(i3, 10)
+        guard_true(i5) [p0, i0]
+        jump(p0,i1)
+        """
+        try:
+            self.vectorize(self.parse_loop(ops))
+            py.test.fail("loop is not vectorizable")
+        except NotAVectorizeableLoop:
+            pass
+
+    def test_constant_expansion(self):
+        ops = """
+        [p0,i0]
+        guard_early_exit() [p0,i0]
+        i1 = getarrayitem_raw(p0, i0, descr=floatarraydescr)
+        i4 = int_sub(i1, 42)
+        i3 = int_add(i0,1)
+        i5 = int_lt(i3, 10)
+        guard_true(i5) [p0, i0]
+        jump(p0,i3)
+        """
+        opt="""
+        [p0,i0]
+        label(p0,i0)
+        v3 = vec_int_expand(42, 2)
+        label(p0,i0,v3)
+        i20 = int_add(i0, 1)
+        i30 = int_lt(i20, 10)
+        i2 = int_add(i0, 2)
+        i3 = int_lt(i2, 10)
+        guard_true(i3) [p0,i0]
+        i4 = int_add(i0, 2)
+        i5 = int_lt(i2, 10)
+        v1 = vec_getarrayitem_raw(p0, i0, 2, descr=floatarraydescr)
+        v2 = vec_int_sub(v1, v3)
+        jump(p0,i2,v3)
+        """
+        vopt = self.vectorize(self.parse_loop(ops),1)
+        self.assert_equal(vopt.loop, self.parse_loop(opt,add_label=False))
+
+    def test_variable_expansion(self):
+        ops = """
+        [p0,i0,f3]
+        guard_early_exit() [p0,i0]
+        f1 = getarrayitem_raw(p0, i0, descr=floatarraydescr)
+        f4 = int_add(f1, f3)
+        i3 = int_add(i0,1)
+        i5 = int_lt(i3, 10)
+        guard_true(i5) [p0, i0]
+        jump(p0,i3,f3)
+        """
+        opt="""
+        [p0,i0,f3]
+        label(p0,i0,f3)
+        v3 = vec_float_expand(f3,2)
+        label(p0,i0,f3,v3)
+        i20 = int_add(i0, 1)
+        i30 = int_lt(i20, 10)
+        i2 = int_add(i0, 2)
+        i3 = int_lt(i2, 10)
+        guard_true(i3) [p0,i0,f3]
+        i4 = int_add(i0, 2)
+        i5 = int_lt(i2, 10)
+        v1 = vec_getarrayitem_raw(p0, i0, 2, descr=floatarraydescr)
+        v2 = vec_int_add(v1, v3)
+        jump(p0,i2,f3,v3)
+        """
+        vopt = self.vectorize(self.parse_loop(ops),1)
+        self.assert_equal(vopt.loop, self.parse_loop(opt, add_label=False))
+
+    def test_accumulate_basic(self):
+        trace = """
+        [p0, i0, f0]
+        guard_early_exit() [p0, i0, f0]
+        f1 = raw_load(p0, i0, descr=floatarraydescr)
+        f2 = float_add(f0, f1)
+        i1 = int_add(i0, 8)
+        i2 = int_lt(i1, 100)
+        guard_false(i2) [p0, i0, f2]
+        jump(p0, i1, f2)
+        """
+        trace_opt = """
+        [p0, i0, v2[f64|2]]
+        i1 = int_add(i0, 16)
+        i2 = int_lt(i1, 100)
+        guard_false(i2) [p0, i0, v[f64|2]]
+        i10 = int_add(i0, 16)
+        i20 = int_lt(i10, 100)
+        v1[f64|2] = vec_raw_load(p0, i0, 2, descr=floatarraydescr)
+        v3[f64|2] = vec_float_add(v2[f64|2], v1[f64|2])
+        jump(p0, i1, v3[f64|2])
+        """
+        opt = self.vectorize(self.parse_loop(trace))
+        assert len(opt.packset.accum_vars) == 1
+        assert opt.loop.inputargs[2] in opt.packset.accum_vars
+        self.debug_print_operations(opt.loop)
+
+    def test_element_f45_in_guard_failargs(self):
+        """Vectorize a trace whose loaded value f45 is used outside the pack.
+
+        In the input trace f45 appears in the fail arguments of two guards
+        and in the jump.  The vectorized loop is compared with an expected
+        trace in which the jump receives scalars produced by
+        ``vec_float_unpack``.
+        """
+        # input trace: two float loads, an add and a store per iteration
+        ops = """
+        [p36, i28, p9, i37, p14, f34, p12, p38, f35, p39, i40, i41, p42, i43, 
i44, i21, i4, i0, i18]
+        guard_early_exit() [p38, p12, p9, p14, p39, i37, i44, f35, i40, p42, 
i43, f34, i28, p36, i41]
+        f45 = raw_load(i21, i44, descr=floatarraydescr) 
+        guard_not_invalidated() [p38, p12, p9, p14, f45, p39, i37, i44, f35, 
i40, p42, i43, None, i28, p36, i41]
+        i46 = int_add(i44, 8) 
+        f47 = raw_load(i4, i41, descr=floatarraydescr) 
+        i48 = int_add(i41, 8) 
+        f49 = float_add(f45, f47)
+        raw_store(i0, i37, f49, descr=floatarraydescr)
+        i50 = int_add(i28, 1)
+        i51 = int_add(i37, 8)
+        i52 = int_ge(i50, i18) 
+        guard_false(i52) [p38, p12, p9, p14, i48, i46, f47, i51, i50, f45, 
p39, None, None, None, i40, p42, i43, None, None, p36, None]
+        jump(p36, i50, p9, i51, p14, f45, p12, p38, f47, p39, i40, i48, p42, 
i43, i46, i21, i4, i0, i18)
+        """
+        # expected trace: loads, add and store use vector operations of
+        # width 2; the jump arguments f100/f101 are unpacked from the vectors
+        opt = """
+        [p36, i28, p9, i37, p14, f34, p12, p38, f35, p39, i40, i41, p42, i43, 
i44, i21, i4, i0, i18]
+        guard_not_invalidated() [p38, p12, p9, p14, p39, i37, i44, f35, i40, 
p42, i43, f34, i28, p36, i41]
+        i50 = int_add(i28, 1) 
+        i48 = int_add(i41, 8) 
+        i51 = int_add(i37, 8) 
+        i54 = int_add(i41, 16) 
+        i46 = int_add(i44, 8) 
+        i56 = int_add(i37, 16) 
+        i52 = int_ge(i50, i18) 
+        i637 = int_add(i28, 2)
+        i638 = int_ge(i637, i18)
+        guard_false(i638) [p36, i28, p9, i37, p14, f34, p12, p38, f35, p39, 
i40, i41, p42, i43, i44, i21, i4, i0, i18]
+        i55 = int_add(i44, 16) 
+        i629 = int_add(i28, 2)
+        i57 = int_ge(i637, i18) 
+        v61 = vec_raw_load(i21, i44, 2, descr=floatarraydescr) 
+        v62 = vec_raw_load(i4, i41, 2, descr=floatarraydescr) 
+        v63 = vec_float_add(v61, v62) 
+        vec_raw_store(i0, i37, v63, descr=floatarraydescr) 
+        f100 = vec_float_unpack(v61, 1, 1)
+        f101 = vec_float_unpack(v62, 1, 1)
+        jump(p36, i637, p9, i56, p14, f100, p12, p38, f101, p39, i40, i54, 
p42, i43, i55, i21, i4, i0, i18)
+        """
+        vopt = self.vectorize(self.parse_loop(ops))
+        self.assert_equal(vopt.loop, self.parse_loop(opt))
+
+    def test_shrink_vector_size(self):
+        ops = """
+        [p0,p1,i1]
+        guard_early_exit() []
+        f1 = getarrayitem_raw(p0, i1, descr=floatarraydescr)
+        i2 = cast_float_to_singlefloat(f1)
+        setarrayitem_raw(p1, i1, i2, descr=singlefloatarraydescr)
+        i3 = int_add(i1, 1)
+        i4 = int_ge(i3, 36)
+        guard_false(i4) []
+        jump(p0, p1, i3)
+        """
+        opt = """
+        [p0, p1, i1]
+        i3 = int_add(i1, 1)
+        i4 = int_ge(i3, 36)
+        i50 = int_add(i1, 4)
+        i51 = int_ge(i50, 36)
+        guard_false(i51) [p0, p1, i1]
+        i5 = int_add(i1, 2)
+        i8 = int_ge(i5, 36)
+        i6 = int_add(i1, 3)
+        i11 = int_ge(i6, 36)
+        i7 = int_add(i1, 4)
+        i14 = int_ge(i50, 36)
+        v17 = vec_getarrayitem_raw(p0, i1, 2, descr=floatarraydescr)
+        v19 = vec_cast_float_to_singlefloat(v17)
+        v18 = vec_getarrayitem_raw(p0, i5, 2, descr=floatarraydescr)
+        v20 = vec_cast_float_to_singlefloat(v18)
+        v21 = vec_float_pack(v19, v20, 2, 2)
+        vec_setarrayitem_raw(p1, i1, v21, descr=singlefloatarraydescr)
+        jump(p0, p1, i50)
+        """
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to