Author: Richard Plangger <r...@pasra.at> Branch: vecopt2 Changeset: r77126:c7cbb61784d6 Date: 2015-04-29 15:11 +0200 http://bitbucket.org/pypy/pypy/changeset/c7cbb61784d6/
Log: vectorization now uses the preamble of the unrolling optimization, this is a step towards a unified unrolling algorithm (and keeps most of the variables in the register) some test changes that were needed after the small trace_optimize refactoring diff --git a/rpython/jit/backend/llgraph/runner.py b/rpython/jit/backend/llgraph/runner.py --- a/rpython/jit/backend/llgraph/runner.py +++ b/rpython/jit/backend/llgraph/runner.py @@ -242,6 +242,8 @@ translate_support_code = False is_llgraph = True + vector_register_size = 16 + def __init__(self, rtyper, stats=None, *ignored_args, **kwds): model.AbstractCPU.__init__(self) self.rtyper = rtyper diff --git a/rpython/jit/backend/x86/detect_sse2.py b/rpython/jit/backend/x86/detect_sse2.py --- a/rpython/jit/backend/x86/detect_sse2.py +++ b/rpython/jit/backend/x86/detect_sse2.py @@ -2,35 +2,42 @@ from rpython.rtyper.lltypesystem import lltype, rffi from rpython.rlib.rmmap import alloc, free - -def detect_sse2(): +def cpu_info(instr): data = alloc(4096) pos = 0 - for c in ("\xB8\x01\x00\x00\x00" # MOV EAX, 1 - "\x53" # PUSH EBX - "\x0F\xA2" # CPUID - "\x5B" # POP EBX - "\x92" # XCHG EAX, EDX - "\xC3"): # RET + for c in instr: data[pos] = c pos += 1 fnptr = rffi.cast(lltype.Ptr(lltype.FuncType([], lltype.Signed)), data) code = fnptr() free(data, 4096) + return code + +def detect_sse2(): + code = cpu_info("\xB8\x01\x00\x00\x00" # MOV EAX, 1 + "\x53" # PUSH EBX + "\x0F\xA2" # CPUID + "\x5B" # POP EBX + "\x92" # XCHG EAX, EDX + "\xC3" # RET + ) return bool(code & (1<<25)) and bool(code & (1<<26)) +def byte_size_for_vector_registers(sse2, avx, avxbw): + if avx: + if avxbw: + return 64 + return 32 + if sse2: + return 16 + assert False, "No vector extension supported" + def detect_x32_mode(): - data = alloc(4096) - pos = 0 # 32-bit 64-bit / x32 - for c in ("\x48" # DEC EAX - "\xB8\xC8\x00\x00\x00"# MOV EAX, 200 MOV RAX, 0x40404040000000C8 - "\x40\x40\x40\x40" # 4x INC EAX - "\xC3"): # RET RET - data[pos] = c - pos += 1 - fnptr = 
rffi.cast(lltype.Ptr(lltype.FuncType([], lltype.Signed)), data) - code = fnptr() - free(data, 4096) + # 32-bit 64-bit / x32 + code = cpu_info("\x48" # DEC EAX + "\xB8\xC8\x00\x00\x00"# MOV EAX, 200 MOV RAX, 0x40404040000000C8 + "\x40\x40\x40\x40" # 4x INC EAX + "\xC3") # RET RET assert code in (200, 204, 0x40404040000000C8) return code == 200 diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py --- a/rpython/jit/backend/x86/regalloc.py +++ b/rpython/jit/backend/x86/regalloc.py @@ -1301,6 +1301,7 @@ else: src_locations2.append(src_loc) dst_locations2.append(dst_loc) + # Do we have a temp var? if IS_X86_64: tmpreg = X86_64_SCRATCH_REG @@ -1466,7 +1467,7 @@ not descr.is_array_of_structs() itemsize, ofs, _ = unpack_arraydescr(descr) integer = not descr.is_array_of_floats() - aligned = True + aligned = False args = op.getarglist() base_loc = self.rm.make_sure_var_in_reg(op.getarg(0), args) ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args) @@ -1487,7 +1488,7 @@ ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args) integer = not descr.is_array_of_floats() - aligned = True + aligned = False self.perform_discard(op, [base_loc, ofs_loc, value_loc, imm(itemsize), imm(ofs), imm(integer), imm(aligned)]) diff --git a/rpython/jit/backend/x86/runner.py b/rpython/jit/backend/x86/runner.py --- a/rpython/jit/backend/x86/runner.py +++ b/rpython/jit/backend/x86/runner.py @@ -24,6 +24,8 @@ with_threads = False frame_reg = regloc.ebp + vector_register_size = 0 # in bytes + from rpython.jit.backend.x86.arch import JITFRAME_FIXED_SIZE all_reg_indexes = gpr_reg_mgr_cls.all_reg_indexes gen_regs = gpr_reg_mgr_cls.all_regs @@ -148,6 +150,8 @@ IS_64_BIT = False + vector_register_size = 16 + def __init__(self, *args, **kwargs): assert sys.maxint == (2**31 - 1) super(CPU386, self).__init__(*args, **kwargs) @@ -163,4 +167,6 @@ IS_64_BIT = True + vector_register_size = 16 + CPU = CPU386 diff --git a/rpython/jit/backend/x86/test/test_vectorize.py 
b/rpython/jit/backend/x86/test/test_vectorize.py --- a/rpython/jit/backend/x86/test/test_vectorize.py +++ b/rpython/jit/backend/x86/test/test_vectorize.py @@ -11,9 +11,11 @@ from rpython.rtyper.lltypesystem import lltype -class TestBasic(test_basic.Jit386Mixin, test_vectorize.VectorizeLLtypeTests): +class TestBasic(test_vectorize.VectorizeLLtypeTests, test_basic.Jit386Mixin): # for the individual tests see # ====> ../../../metainterp/test/test_basic.py + enable_opts = 'intbounds:rewrite:virtualize:string:earlyforce:pure:heap:unroll' + pass diff --git a/rpython/jit/metainterp/optimizeopt/__init__.py b/rpython/jit/metainterp/optimizeopt/__init__.py --- a/rpython/jit/metainterp/optimizeopt/__init__.py +++ b/rpython/jit/metainterp/optimizeopt/__init__.py @@ -67,13 +67,13 @@ loop.logops = metainterp_sd.logger_noopt.log_loop(loop.inputargs, loop.operations) optimizations, unroll = build_opt_chain(metainterp_sd, enable_opts) - - if jitdriver_sd.vectorize: - optimize_vector(metainterp_sd, jitdriver_sd, loop, optimizations) - elif unroll: - return optimize_unroll(metainterp_sd, jitdriver_sd, loop, - optimizations, inline_short_preamble, - start_state, export_state) + if unroll: + if not export_state and warmstate.vectorize and jitdriver_sd.vectorize: + optimize_vector(metainterp_sd, jitdriver_sd, loop, optimizations) + else: + return optimize_unroll(metainterp_sd, jitdriver_sd, loop, + optimizations, inline_short_preamble, + start_state, export_state) else: optimizer = Optimizer(metainterp_sd, jitdriver_sd, loop, optimizations) diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py --- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py +++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py @@ -12,7 +12,7 @@ from rpython.jit.metainterp.optimizeopt.dependency import DependencyGraph from rpython.jit.metainterp.optimizeopt.unroll import Inliner from 
rpython.jit.metainterp.optimizeopt.vectorize import (VectorizingOptimizer, MemoryRef, - isomorphic, Pair, NotAVectorizeableLoop) + isomorphic, Pair, NotAVectorizeableLoop) from rpython.jit.metainterp.optimize import InvalidLoop from rpython.jit.metainterp.history import ConstInt, BoxInt, get_const_ptr_for_string from rpython.jit.metainterp import executor, compile, resume @@ -22,6 +22,8 @@ class FakeJitDriverStaticData(object): vectorize=True +ARCH_VEC_REG_SIZE = 16 + class VecTestHelper(DependencyBaseTest): enable_opts = "intbounds:rewrite:virtualize:string:earlyforce:pure:heap:unfold" @@ -54,7 +56,7 @@ if unroll_factor == -1 and opt.smallest_type_bytes == 0: raise NotAVectorizeableLoop() if unroll_factor == -1: - unroll_factor = opt.get_unroll_count() + unroll_factor = opt.get_unroll_count(ARCH_VEC_REG_SIZE) opt.unroll_loop_iterations(loop, unroll_factor) opt.loop.operations = opt.get_newoperations() opt.clear_newoperations() @@ -164,6 +166,18 @@ """ self.assert_unroll_loop_equals(self.parse_loop(ops), self.parse_loop(ops), 2) + def test_vectorize_empty_with_early_exit(self): + ops = """ + [] + guard_early_exit() [] + jump() + """ + try: + self.schedule(self.parse_loop(ops),1) + py.test.fail("empty loop with no memory references is not vectorizable") + except NotAVectorizeableLoop: + pass + def test_unroll_empty_stays_empty_parameter(self): """ same as test_unroll_empty_stays_empty but with a parameter """ ops = """ @@ -238,7 +252,7 @@ """ vopt = self.vectoroptimizer(self.parse_loop(ops)) assert 0 == vopt.smallest_type_bytes - assert 0 == vopt.get_unroll_count() + assert 0 == vopt.get_unroll_count(ARCH_VEC_REG_SIZE) def test_array_operation_indices_not_unrolled(self): ops = """ diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py --- a/rpython/jit/metainterp/optimizeopt/vectorize.py +++ b/rpython/jit/metainterp/optimizeopt/vectorize.py @@ -71,20 +71,22 @@ self.clear_newoperations() 
label = self.loop.operations[0] jump = self.loop.operations[-1] - if jump.getopnum() != rop.LABEL: + if jump.getopnum() not in (rop.LABEL, rop.JUMP): # compile_loop appends a additional label to all loops # we cannot optimize normal traces + assert False raise NotAVectorizeableLoop() self.linear_find_smallest_type(self.loop) byte_count = self.smallest_type_bytes - if byte_count == 0 or label.getopnum() != rop.LABEL: + vsize = self.metainterp_sd.cpu.vector_register_size + if vsize == 0 or byte_count == 0 or label.getopnum() != rop.LABEL: # stop, there is no chance to vectorize this trace # we cannot optimize normal traces (if there is no label) raise NotAVectorizeableLoop() # unroll - self.unroll_count = self.get_unroll_count() + self.unroll_count = self.get_unroll_count(vsize) self.unroll_loop_iterations(self.loop, self.unroll_count) self.loop.operations = self.get_newoperations(); self.clear_newoperations(); @@ -97,6 +99,8 @@ self.schedule() def emit_operation(self, op): + if op.getopnum() == rop.GUARD_EARLY_EXIT: + return self._last_emitted_op = op self._newoperations.append(op) @@ -111,10 +115,15 @@ op_count = len(loop.operations) label_op = loop.operations[0].clone() - jump_op = loop.operations[op_count-1].clone() + assert label_op.getopnum() == rop.LABEL + jump_op = loop.operations[op_count-1] # use the target token of the label - jump_op = ResOperation(rop.JUMP, jump_op.getarglist(), None, label_op.getdescr()) - assert label_op.getopnum() == rop.LABEL + assert jump_op.getopnum() in (rop.LABEL, rop.JUMP) + if jump_op.getopnum() == rop.LABEL: + jump_op = ResOperation(rop.JUMP, jump_op.getarglist(), None, label_op.getdescr()) + else: + jump_op = jump_op.clone() + jump_op.setdescr(label_op.getdescr()) assert jump_op.is_final() self.emit_unrolled_operation(label_op) @@ -228,13 +237,12 @@ or byte_count < self.smallest_type_bytes: self.smallest_type_bytes = byte_count - def get_unroll_count(self): + def get_unroll_count(self, simd_vec_reg_bytes): """ This is an 
estimated number of further unrolls """ # this optimization is not opaque, and needs info about the CPU byte_count = self.smallest_type_bytes if byte_count == 0: return 0 - simd_vec_reg_bytes = 16 # TODO get from cpu unroll_count = simd_vec_reg_bytes // byte_count return unroll_count-1 # it is already unrolled once @@ -357,7 +365,9 @@ if not we_are_translated(): for node in self.dependency_graph.nodes: assert node.emitted - self.loop.operations = self.collapse_index_guards() + self.loop.operations = self._newoperations[:] + #self.collapse_index_guards() + #self.clear_newoperations() def relax_index_guards(self): label_idx = 0 diff --git a/rpython/jit/metainterp/pyjitpl.py b/rpython/jit/metainterp/pyjitpl.py --- a/rpython/jit/metainterp/pyjitpl.py +++ b/rpython/jit/metainterp/pyjitpl.py @@ -2135,8 +2135,10 @@ self.seen_loop_header_for_jdindex = -1 # can only emit early exit if liveness is present # TODO think of a better way later - if self.framestack[-1].jitcode.liveness.get(0, None): + if self.framestack[-1].jitcode.liveness.get(0, None) \ + and self.jitdriver_sd.vectorize: self.generate_guard(rop.GUARD_EARLY_EXIT) + #self.history.record(rop.GUARD_EARLY_EXIT, [], None) try: self.interpret() except SwitchToBlackhole, stb: diff --git a/rpython/jit/metainterp/test/support.py b/rpython/jit/metainterp/test/support.py --- a/rpython/jit/metainterp/test/support.py +++ b/rpython/jit/metainterp/test/support.py @@ -48,6 +48,7 @@ trace_limit = sys.maxint enable_opts = ALL_OPTS_DICT + vectorize = True if kwds.pop('disable_optimizations', False): FakeWarmRunnerState.enable_opts = {} diff --git a/rpython/jit/metainterp/test/test_ajit.py b/rpython/jit/metainterp/test/test_ajit.py --- a/rpython/jit/metainterp/test/test_ajit.py +++ b/rpython/jit/metainterp/test/test_ajit.py @@ -2764,9 +2764,13 @@ return i # seen = [] - def my_optimize_trace(metainterp_sd, jitdriver_sd, loop, enable_opts, + def my_optimize_trace(metainterp_sd, jitdriver_sd, loop, warmstate, *args, **kwds): - 
seen.append('unroll' in enable_opts) + if 'try_disabling_unroll' in kwds and \ + kwds['try_disabling_unroll']: + seen.append(False) + else: + seen.append('unroll' in warmstate.enable_opts) raise InvalidLoop old_optimize_trace = optimizeopt.optimize_trace optimizeopt.optimize_trace = my_optimize_trace diff --git a/rpython/jit/metainterp/test/test_vectorize.py b/rpython/jit/metainterp/test/test_vectorize.py --- a/rpython/jit/metainterp/test/test_vectorize.py +++ b/rpython/jit/metainterp/test/test_vectorize.py @@ -13,13 +13,14 @@ free_raw_storage, raw_storage_getitem) class VectorizeTests: - enable_opts = 'all' + enable_opts = 'intbounds:rewrite:virtualize:string:earlyforce:pure:heap:unroll' def meta_interp(self, f, args, policy=None): return ll_meta_interp(f, args, enable_opts=self.enable_opts, policy=policy, CPUClass=self.CPUClass, - type_system=self.type_system) + type_system=self.type_system, + vectorize=1) @py.test.mark.parametrize('i',[3,4,5,6,7,8,9,50]) def test_vectorize_simple_load_arith_store_int_add_index(self,i): _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit