Author: Richard Plangger <r...@pasra.at> Branch: vecopt2 Changeset: r77126:c7cbb61784d6 Date: 2015-04-29 15:11 +0200 http://bitbucket.org/pypy/pypy/changeset/c7cbb61784d6/
Log: vectorization now uses the preamble of the unrolling optimization, this is a step towards a unified unrolling algorithm (and keeps most of the variables in the register) some test changes that were needed after the small trace_optimize refactoring diff --git a/rpython/jit/backend/llgraph/runner.py b/rpython/jit/backend/llgraph/runner.py --- a/rpython/jit/backend/llgraph/runner.py +++ b/rpython/jit/backend/llgraph/runner.py @@ -242,6 +242,8 @@ translate_support_code = False is_llgraph = True + vector_register_size = 16 + def __init__(self, rtyper, stats=None, *ignored_args, **kwds): model.AbstractCPU.__init__(self) self.rtyper = rtyper diff --git a/rpython/jit/backend/x86/detect_sse2.py b/rpython/jit/backend/x86/detect_sse2.py --- a/rpython/jit/backend/x86/detect_sse2.py +++ b/rpython/jit/backend/x86/detect_sse2.py @@ -2,35 +2,42 @@ from rpython.rtyper.lltypesystem import lltype, rffi from rpython.rlib.rmmap import alloc, free - -def detect_sse2(): +def cpu_info(instr): data = alloc(4096) pos = 0 - for c in ("\xB8\x01\x00\x00\x00" # MOV EAX, 1 - "\x53" # PUSH EBX - "\x0F\xA2" # CPUID - "\x5B" # POP EBX - "\x92" # XCHG EAX, EDX - "\xC3"): # RET + for c in instr: data[pos] = c pos += 1 fnptr = rffi.cast(lltype.Ptr(lltype.FuncType([], lltype.Signed)), data) code = fnptr() free(data, 4096) + return code + +def detect_sse2(): + code = cpu_info("\xB8\x01\x00\x00\x00" # MOV EAX, 1 + "\x53" # PUSH EBX + "\x0F\xA2" # CPUID + "\x5B" # POP EBX + "\x92" # XCHG EAX, EDX + "\xC3" # RET + ) return bool(code & (1<<25)) and bool(code & (1<<26)) +def byte_size_for_vector_registers(sse2, avx, avxbw): + if avx: + if avxbw: + return 64 + return 32 + if sse2: + return 16 + assert False, "No vector extension supported" + def detect_x32_mode(): - data = alloc(4096) - pos = 0 # 32-bit 64-bit / x32 - for c in ("\x48" # DEC EAX - "\xB8\xC8\x00\x00\x00"# MOV EAX, 200 MOV RAX, 0x40404040000000C8 - "\x40\x40\x40\x40" # 4x INC EAX - "\xC3"): # RET RET - data[pos] = c - pos += 1 - fnptr = 
rffi.cast(lltype.Ptr(lltype.FuncType([], lltype.Signed)), data) - code = fnptr() - free(data, 4096) + # 32-bit 64-bit / x32 + code = cpu_info("\x48" # DEC EAX + "\xB8\xC8\x00\x00\x00"# MOV EAX, 200 MOV RAX, 0x40404040000000C8 + "\x40\x40\x40\x40" # 4x INC EAX + "\xC3") # RET RET assert code in (200, 204, 0x40404040000000C8) return code == 200 diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py --- a/rpython/jit/backend/x86/regalloc.py +++ b/rpython/jit/backend/x86/regalloc.py @@ -1301,6 +1301,7 @@ else: src_locations2.append(src_loc) dst_locations2.append(dst_loc) + # Do we have a temp var? if IS_X86_64: tmpreg = X86_64_SCRATCH_REG @@ -1466,7 +1467,7 @@ not descr.is_array_of_structs() itemsize, ofs, _ = unpack_arraydescr(descr) integer = not descr.is_array_of_floats() - aligned = True + aligned = False args = op.getarglist() base_loc = self.rm.make_sure_var_in_reg(op.getarg(0), args) ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args) @@ -1487,7 +1488,7 @@ ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args) integer = not descr.is_array_of_floats() - aligned = True + aligned = False self.perform_discard(op, [base_loc, ofs_loc, value_loc, imm(itemsize), imm(ofs), imm(integer), imm(aligned)]) diff --git a/rpython/jit/backend/x86/runner.py b/rpython/jit/backend/x86/runner.py --- a/rpython/jit/backend/x86/runner.py +++ b/rpython/jit/backend/x86/runner.py @@ -24,6 +24,8 @@ with_threads = False frame_reg = regloc.ebp + vector_register_size = 0 # in bytes + from rpython.jit.backend.x86.arch import JITFRAME_FIXED_SIZE all_reg_indexes = gpr_reg_mgr_cls.all_reg_indexes gen_regs = gpr_reg_mgr_cls.all_regs @@ -148,6 +150,8 @@ IS_64_BIT = False + vector_register_size = 16 + def __init__(self, *args, **kwargs): assert sys.maxint == (2**31 - 1) super(CPU386, self).__init__(*args, **kwargs) @@ -163,4 +167,6 @@ IS_64_BIT = True + vector_register_size = 16 + CPU = CPU386 diff --git a/rpython/jit/backend/x86/test/test_vectorize.py 
b/rpython/jit/backend/x86/test/test_vectorize.py --- a/rpython/jit/backend/x86/test/test_vectorize.py +++ b/rpython/jit/backend/x86/test/test_vectorize.py @@ -11,9 +11,11 @@ from rpython.rtyper.lltypesystem import lltype -class TestBasic(test_basic.Jit386Mixin, test_vectorize.VectorizeLLtypeTests): +class TestBasic(test_vectorize.VectorizeLLtypeTests, test_basic.Jit386Mixin): # for the individual tests see # ====> ../../../metainterp/test/test_basic.py + enable_opts = 'intbounds:rewrite:virtualize:string:earlyforce:pure:heap:unroll' + pass diff --git a/rpython/jit/metainterp/optimizeopt/__init__.py b/rpython/jit/metainterp/optimizeopt/__init__.py --- a/rpython/jit/metainterp/optimizeopt/__init__.py +++ b/rpython/jit/metainterp/optimizeopt/__init__.py @@ -67,13 +67,13 @@ loop.logops = metainterp_sd.logger_noopt.log_loop(loop.inputargs, loop.operations) optimizations, unroll = build_opt_chain(metainterp_sd, enable_opts) - - if jitdriver_sd.vectorize: - optimize_vector(metainterp_sd, jitdriver_sd, loop, optimizations) - elif unroll: - return optimize_unroll(metainterp_sd, jitdriver_sd, loop, - optimizations, inline_short_preamble, - start_state, export_state) + if unroll: + if not export_state and warmstate.vectorize and jitdriver_sd.vectorize: + optimize_vector(metainterp_sd, jitdriver_sd, loop, optimizations) + else: + return optimize_unroll(metainterp_sd, jitdriver_sd, loop, + optimizations, inline_short_preamble, + start_state, export_state) else: optimizer = Optimizer(metainterp_sd, jitdriver_sd, loop, optimizations) diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py --- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py +++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py @@ -12,7 +12,7 @@ from rpython.jit.metainterp.optimizeopt.dependency import DependencyGraph from rpython.jit.metainterp.optimizeopt.unroll import Inliner from 
rpython.jit.metainterp.optimizeopt.vectorize import (VectorizingOptimizer, MemoryRef, - isomorphic, Pair, NotAVectorizeableLoop) + isomorphic, Pair, NotAVectorizeableLoop) from rpython.jit.metainterp.optimize import InvalidLoop from rpython.jit.metainterp.history import ConstInt, BoxInt, get_const_ptr_for_string from rpython.jit.metainterp import executor, compile, resume @@ -22,6 +22,8 @@ class FakeJitDriverStaticData(object): vectorize=True +ARCH_VEC_REG_SIZE = 16 + class VecTestHelper(DependencyBaseTest): enable_opts = "intbounds:rewrite:virtualize:string:earlyforce:pure:heap:unfold" @@ -54,7 +56,7 @@ if unroll_factor == -1 and opt.smallest_type_bytes == 0: raise NotAVectorizeableLoop() if unroll_factor == -1: - unroll_factor = opt.get_unroll_count() + unroll_factor = opt.get_unroll_count(ARCH_VEC_REG_SIZE) opt.unroll_loop_iterations(loop, unroll_factor) opt.loop.operations = opt.get_newoperations() opt.clear_newoperations() @@ -164,6 +166,18 @@ """ self.assert_unroll_loop_equals(self.parse_loop(ops), self.parse_loop(ops), 2) + def test_vectorize_empty_with_early_exit(self): + ops = """ + [] + guard_early_exit() [] + jump() + """ + try: + self.schedule(self.parse_loop(ops),1) + py.test.fail("empty loop with no memory references is not vectorizable") + except NotAVectorizeableLoop: + pass + def test_unroll_empty_stays_empty_parameter(self): """ same as test_unroll_empty_stays_empty but with a parameter """ ops = """ @@ -238,7 +252,7 @@ """ vopt = self.vectoroptimizer(self.parse_loop(ops)) assert 0 == vopt.smallest_type_bytes - assert 0 == vopt.get_unroll_count() + assert 0 == vopt.get_unroll_count(ARCH_VEC_REG_SIZE) def test_array_operation_indices_not_unrolled(self): ops = """ diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py --- a/rpython/jit/metainterp/optimizeopt/vectorize.py +++ b/rpython/jit/metainterp/optimizeopt/vectorize.py @@ -71,20 +71,22 @@ self.clear_newoperations() 
label = self.loop.operations[0] jump = self.loop.operations[-1] - if jump.getopnum() != rop.LABEL: + if jump.getopnum() not in (rop.LABEL, rop.JUMP): # compile_loop appends a additional label to all loops # we cannot optimize normal traces + assert False raise NotAVectorizeableLoop() self.linear_find_smallest_type(self.loop) byte_count = self.smallest_type_bytes - if byte_count == 0 or label.getopnum() != rop.LABEL: + vsize = self.metainterp_sd.cpu.vector_register_size + if vsize == 0 or byte_count == 0 or label.getopnum() != rop.LABEL: # stop, there is no chance to vectorize this trace # we cannot optimize normal traces (if there is no label) raise NotAVectorizeableLoop() # unroll - self.unroll_count = self.get_unroll_count() + self.unroll_count = self.get_unroll_count(vsize) self.unroll_loop_iterations(self.loop, self.unroll_count) self.loop.operations = self.get_newoperations(); self.clear_newoperations(); @@ -97,6 +99,8 @@ self.schedule() def emit_operation(self, op): + if op.getopnum() == rop.GUARD_EARLY_EXIT: + return self._last_emitted_op = op self._newoperations.append(op) @@ -111,10 +115,15 @@ op_count = len(loop.operations) label_op = loop.operations[0].clone() - jump_op = loop.operations[op_count-1].clone() + assert label_op.getopnum() == rop.LABEL + jump_op = loop.operations[op_count-1] # use the target token of the label - jump_op = ResOperation(rop.JUMP, jump_op.getarglist(), None, label_op.getdescr()) - assert label_op.getopnum() == rop.LABEL + assert jump_op.getopnum() in (rop.LABEL, rop.JUMP) + if jump_op.getopnum() == rop.LABEL: + jump_op = ResOperation(rop.JUMP, jump_op.getarglist(), None, label_op.getdescr()) + else: + jump_op = jump_op.clone() + jump_op.setdescr(label_op.getdescr()) assert jump_op.is_final() self.emit_unrolled_operation(label_op) @@ -228,13 +237,12 @@ or byte_count < self.smallest_type_bytes: self.smallest_type_bytes = byte_count - def get_unroll_count(self): + def get_unroll_count(self, simd_vec_reg_bytes): """ This is an 
estimated number of further unrolls """ # this optimization is not opaque, and needs info about the CPU byte_count = self.smallest_type_bytes if byte_count == 0: return 0 - simd_vec_reg_bytes = 16 # TODO get from cpu unroll_count = simd_vec_reg_bytes // byte_count return unroll_count-1 # it is already unrolled once @@ -357,7 +365,9 @@ if not we_are_translated(): for node in self.dependency_graph.nodes: assert node.emitted - self.loop.operations = self.collapse_index_guards() + self.loop.operations = self._newoperations[:] + #self.collapse_index_guards() + #self.clear_newoperations() def relax_index_guards(self): label_idx = 0 diff --git a/rpython/jit/metainterp/pyjitpl.py b/rpython/jit/metainterp/pyjitpl.py --- a/rpython/jit/metainterp/pyjitpl.py +++ b/rpython/jit/metainterp/pyjitpl.py @@ -2135,8 +2135,10 @@ self.seen_loop_header_for_jdindex = -1 # can only emit early exit if liveness is present # TODO think of a better way later - if self.framestack[-1].jitcode.liveness.get(0, None): + if self.framestack[-1].jitcode.liveness.get(0, None) \ + and self.jitdriver_sd.vectorize: self.generate_guard(rop.GUARD_EARLY_EXIT) + #self.history.record(rop.GUARD_EARLY_EXIT, [], None) try: self.interpret() except SwitchToBlackhole, stb: diff --git a/rpython/jit/metainterp/test/support.py b/rpython/jit/metainterp/test/support.py --- a/rpython/jit/metainterp/test/support.py +++ b/rpython/jit/metainterp/test/support.py @@ -48,6 +48,7 @@ trace_limit = sys.maxint enable_opts = ALL_OPTS_DICT + vectorize = True if kwds.pop('disable_optimizations', False): FakeWarmRunnerState.enable_opts = {} diff --git a/rpython/jit/metainterp/test/test_ajit.py b/rpython/jit/metainterp/test/test_ajit.py --- a/rpython/jit/metainterp/test/test_ajit.py +++ b/rpython/jit/metainterp/test/test_ajit.py @@ -2764,9 +2764,13 @@ return i # seen = [] - def my_optimize_trace(metainterp_sd, jitdriver_sd, loop, enable_opts, + def my_optimize_trace(metainterp_sd, jitdriver_sd, loop, warmstate, *args, **kwds): - 
seen.append('unroll' in enable_opts) + if 'try_disabling_unroll' in kwds and \ + kwds['try_disabling_unroll']: + seen.append(False) + else: + seen.append('unroll' in warmstate.enable_opts) raise InvalidLoop old_optimize_trace = optimizeopt.optimize_trace optimizeopt.optimize_trace = my_optimize_trace diff --git a/rpython/jit/metainterp/test/test_vectorize.py b/rpython/jit/metainterp/test/test_vectorize.py --- a/rpython/jit/metainterp/test/test_vectorize.py +++ b/rpython/jit/metainterp/test/test_vectorize.py @@ -13,13 +13,14 @@ free_raw_storage, raw_storage_getitem) class VectorizeTests: - enable_opts = 'all' + enable_opts = 'intbounds:rewrite:virtualize:string:earlyforce:pure:heap:unroll' def meta_interp(self, f, args, policy=None): return ll_meta_interp(f, args, enable_opts=self.enable_opts, policy=policy, CPUClass=self.CPUClass, - type_system=self.type_system) + type_system=self.type_system, + vectorize=1) @py.test.mark.parametrize('i',[3,4,5,6,7,8,9,50]) def test_vectorize_simple_load_arith_store_int_add_index(self,i): _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit