Author: Richard Plangger <r...@pasra.at> Branch: vecopt Changeset: r78292:e70ae41089d7 Date: 2015-06-24 13:58 +0200 http://bitbucket.org/pypy/pypy/changeset/e70ae41089d7/
Log: doc additions, reenabled the int8 expand test (passes now) diff --git a/pypy/module/micronumpy/test/test_zjit.py b/pypy/module/micronumpy/test/test_zjit.py --- a/pypy/module/micronumpy/test/test_zjit.py +++ b/pypy/module/micronumpy/test/test_zjit.py @@ -274,10 +274,9 @@ sum(d) """ def test_int8_expand(self): - py.test.skip("TODO implement assembler") result = self.run("int8_expand") - assert int(result) == 8*8 + sum(range(0,17)) - self.check_vectorized(3, 2) # TODO sum at the end + assert int(result) == 17*8 + sum(range(0,17)) + self.check_vectorized(3, 1) # TODO sum at the end def define_int32_add_const(): return """ diff --git a/rpython/doc/jit/vectorization.rst b/rpython/doc/jit/vectorization.rst --- a/rpython/doc/jit/vectorization.rst +++ b/rpython/doc/jit/vectorization.rst @@ -2,7 +2,10 @@ Vectorization ============= -TBA +To find parallel instructions the tracer must provide enough information about +memory load/store operations. They must be adjacent in memory. The requirement for +that is that they use the same index variable and offset can be expressed as a +linear or affine combination. Features -------- @@ -13,6 +16,9 @@ * int8/int16/int32/int64 arithmetic: add, substract, multiply, negate, absolute * int8/int16/int32/int64 logical: and, or, xor +Reduction +--------- + Reduction is implemented: * sum @@ -21,10 +27,13 @@ * all, any, prod, min, max -To find parallel instructions the tracer must provide enough information about -memory load/store operations. They must be adjacent in memory. The requirement for -that is that they use the same index variable and offset can be expressed as a -a linear or affine combination. +Constant & Variable Expansion +----------------------------- + +Packed arithmetic operations expand scalar variables or constants into vector registers. + +Guard Strengthening +------------------- Unrolled guards are strengthend on a arithmetical level (See GuardStrengthenOpt). 
The resulting vector trace will only have one guard that checks the index. diff --git a/rpython/jit/backend/x86/rx86.py b/rpython/jit/backend/x86/rx86.py --- a/rpython/jit/backend/x86/rx86.py +++ b/rpython/jit/backend/x86/rx86.py @@ -746,6 +746,7 @@ PSHUFLW_xxi = xmminsn('\xF2', rex_nw, '\x0F\x70', register(1,8), register(2), '\xC0', immediate(3, 'b')) PSHUFB_xx = xmminsn('\x66', rex_nw, '\x0F\x38\x00', register(1,8), register(2), '\xC0') PSHUFB_xm = xmminsn('\x66', rex_nw, '\x0F\x38\x00', register(1,8), mem_reg_plus_const(2)) + PSHUFB_xj = xmminsn('\x66', rex_nw, '\x0F\x38\x00', register(1,8), abs_(2)) # SSE3 HADDPD_xx = xmminsn('\x66', rex_nw, '\x0F\x7C', register(1,8), register(2), '\xC0') _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit