lowlevel-blt-bench results for example operations, with and without masks, none of which has a dedicated fast path at the time of writing:
in_n_0565

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  11.0   0.0      16.1   0.1     100.0%      +46.6%
L2  10.7   0.1      15.6   0.2     100.0%      +46.0%
M   9.6    0.0      14.1   0.0     100.0%      +46.6%
HT  8.8    0.0      11.2   0.0     100.0%      +26.3%
VT  8.8    0.0      11.0   0.0     100.0%      +26.1%
R   8.5    0.0      10.6   0.0     100.0%      +25.3%
RT  5.4    0.0      6.0    0.1     100.0%      +11.0%

in_n_8_0565

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  6.3    0.0      10.5   0.0     100.0%      +67.0%
L2  6.4    0.0      10.4   0.1     100.0%      +63.9%
M   5.9    0.0      9.8    0.0     100.0%      +65.9%
HT  5.4    0.0      7.3    0.0     100.0%      +36.0%
VT  5.3    0.0      7.2    0.0     100.0%      +35.7%
R   5.1    0.0      6.9    0.0     100.0%      +33.9%
RT  3.1    0.0      3.6    0.0     100.0%      +14.5%

inrev_n_0565

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  11.0   0.0      15.5   0.1     100.0%      +40.9%
L2  10.7   0.1      15.0   0.2     100.0%      +40.6%
M   9.6    0.0      13.6   0.0     100.0%      +41.9%
HT  8.8    0.0      11.0   0.0     100.0%      +24.4%
VT  8.8    0.0      10.8   0.0     100.0%      +23.9%
R   8.5    0.0      10.4   0.0     100.0%      +23.0%
RT  5.4    0.1      6.0    0.1     100.0%      +10.9%

inrev_n_8_0565

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  6.8    0.0      11.0   0.1     100.0%      +61.5%
L2  6.9    0.1      10.9   0.1     100.0%      +58.0%
M   6.3    0.0      10.2   0.0     100.0%      +61.2%
HT  5.7    0.0      7.6    0.0     100.0%      +32.2%
VT  5.7    0.0      7.5    0.0     100.0%      +32.5%
R   5.5    0.0      7.2    0.0     100.0%      +30.5%
RT  3.2    0.0      3.6    0.0     100.0%      +12.6%

out_n_0565

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  10.8   0.0      15.6   0.1     100.0%      +44.6%
L2  10.4   0.1      15.3   0.1     100.0%      +46.7%
M   9.4    0.0      13.8   0.0     100.0%      +46.2%
HT  8.7    0.0      11.0   0.0     100.0%      +25.7%
VT  8.6    0.0      10.8   0.1     100.0%      +25.4%
R   8.4    0.0      10.4   0.0     100.0%      +24.3%
RT  5.4    0.0      5.7    0.1     100.0%      +6.4%

out_n_8_0565

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  6.2    0.0      10.3   0.0     100.0%      +66.1%
L2  6.2    0.1      10.3   0.1     100.0%      +65.7%
M   5.6    0.2      9.7    0.0     100.0%      +72.9%
HT  5.3    0.0      7.2    0.0     100.0%      +36.1%
VT  5.3    0.0      7.1    0.0     100.0%      +35.7%
R   5.1    0.0      6.8    0.0     100.0%      +33.9%
RT  3.1    0.0      3.5    0.0     100.0%      +13.7%

outrev_n_0565

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  9.8    0.0      15.3   0.1     100.0%      +55.4%
L2  9.5    0.1      15.0   0.2     100.0%      +57.4%
M   8.6    0.0      13.6   0.0     100.0%      +57.8%
HT  8.1    0.0      10.9   0.0     100.0%      +34.5%
VT  8.0    0.0      10.8   0.0     100.0%      +34.0%
R   7.8    0.0      10.3   0.0     100.0%      +32.6%
RT  5.1    0.1      5.8    0.1     100.0%      +13.3%

outrev_n_8_0565

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  6.3    0.0      11.0   0.1     100.0%      +73.8%
L2  6.4    0.0      11.0   0.1     100.0%      +73.1%
M   5.4    0.2      10.2   0.0     100.0%      +87.7%
HT  5.4    0.0      7.6    0.0     100.0%      +40.4%
VT  5.4    0.0      7.5    0.0     100.0%      +40.0%
R   5.2    0.0      7.2    0.0     100.0%      +38.3%
RT  3.1    0.0      3.6    0.1     100.0%      +15.9%
---
 pixman/pixman-arm-simd-asm.S |  376 ++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-simd.c     |    8 +
 2 files changed, 384 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 0c13a73..15eabe2 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -2048,3 +2048,379 @@ generate_composite_function_single_scanline \
     add_8888_8888_8888_process_tail
 
 /******************************************************************************/
+/* Init hook passed to all eight functions below: programs the GE flags and biases the byte-loaded pointers by 3. */
+.macro inout_init
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        msr     CPSR_s, #0x50000
+        /* Point at alpha byte in source / mask / dest */
+ .if REVERSE
+        add     SRC, SRC, #3            /* the REVERSE heads read SRC one byte at a time (LDRB) */
+ .endif
+ .if WITH_MASK
+        add     MASK, MASK, #3          /* every head reads MASK one byte at a time (LDRB) */
+ .endif
+ .if !REVERSE
+        add     DST, DST, #3            /* the non-REVERSE heads read DST one byte at a time (LDRB) */
+ .endif
+        line_saved_regs Y, STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W /* helper defined outside this patch; these five registers are used as scratch/constants below */
+.endm
+/* Newline hook: loads the two constants that the pixel macros receive as their "half" (Y) and "ff" (STRIDE_D) arguments. */
+.macro inout_newline
+        ldr     Y, =0x00800080          /* 0x80 in each halfword: addend of every MLA/SMLABB below */
+        mov     STRIDE_D, #0xff         /* byte mask used by the BIC/AND/BICS instructions below */
+.endm
+/* One pixel, non-REVERSE: each byte of s is scaled by m (WITH_MASK only) and then by d (ff & ~d unless IN_NOT_OUT); result stored from s. */
+.macro inout_1pixel s, m, d, tmp, half, ff, offset
+        uxtb16  tmp, s                  /* tmp = bytes 0 and 2 of s, one per halfword */
+ .if !IN_NOT_OUT
+        bic     d, ff, d                /* d = ff & ~d */
+ .endif
+        uxtb16  s, s, ror #8            /* s = bytes 1 and 3 of s, one per halfword */
+ .if WITH_MASK
+        mla     tmp, tmp, m, half       /* t = x * m + 0x80 in each halfword */
+        mla     s, s, m, half
+        uxtab16 tmp, tmp, tmp, ror #8   /* t += t >> 8 (low byte only); top byte of each halfword is now x * m / 255, rounded */
+        uxtab16 s, s, s, ror #8
+        uxtb16  tmp, tmp, ror #8        /* bring that top byte down to the low byte of each halfword */
+        uxtb16  s, s, ror #8
+ .endif
+        mla     tmp, tmp, d, half       /* same multiply / add-high-byte sequence, this time by d */
+        mla     s, s, d, half
+        uxtab16 tmp, tmp, tmp, ror #8
+        uxtab16 s, s, s, ror #8         /* results for bytes 1 and 3 are already in byte lanes 1 and 3 */
+        mov     tmp, tmp, ror #8        /* move results for bytes 0 and 2 into byte lanes 0 and 2 */
+        sel     s, tmp, s               /* with GE = 0101: lanes 0 and 2 from tmp, lanes 1 and 3 from s */
+        str     s, [DST, #offset]       /* callers pass a negative offset because the head already advanced DST */
+.endm
+/* Two-pixel form of inout_1pixel: the same per-pixel sequence for (s0, m0, d0) and (s1, m1, d1), interleaved, with one STRD at the end. */
+.macro inout_2pixels s0, s1, m0, m1, d0, d1, tmp0, tmp1, half, ff, offset
+        uxtb16  tmp0, s0
+        uxtb16  s0, s0, ror #8
+ .if !IN_NOT_OUT
+        bic     d0, ff, d0              /* d = ff & ~d for both pixels */
+        bic     d1, ff, d1
+ .endif
+        uxtb16  tmp1, s1
+        uxtb16  s1, s1, ror #8
+ .if WITH_MASK
+        mla     tmp0, tmp0, m0, half    /* scale both pixels by their mask bytes */
+        mla     s0, s0, m0, half
+        mla     tmp1, tmp1, m1, half
+        mla     s1, s1, m1, half
+        uxtab16 tmp0, tmp0, tmp0, ror #8
+        uxtab16 s0, s0, s0, ror #8
+        uxtab16 tmp1, tmp1, tmp1, ror #8
+        uxtab16 s1, s1, s1, ror #8
+        uxtb16  tmp0, tmp0, ror #8
+        uxtb16  s0, s0, ror #8
+        uxtb16  tmp1, tmp1, ror #8
+        uxtb16  s1, s1, ror #8
+ .endif
+        mla     tmp0, tmp0, d0, half    /* scale both pixels by d0 / d1 */
+        mla     s0, s0, d0, half
+        mla     tmp1, tmp1, d1, half
+        mla     s1, s1, d1, half
+        uxtab16 tmp0, tmp0, tmp0, ror #8
+        uxtab16 s0, s0, s0, ror #8
+        uxtab16 tmp1, tmp1, tmp1, ror #8
+        uxtab16 s1, s1, s1, ror #8
+        mov     tmp0, tmp0, ror #8
+        mov     tmp1, tmp1, ror #8
+        sel     s0, tmp0, s0            /* merge byte lanes as in inout_1pixel */
+        sel     s1, tmp1, s1
+        strd    s0, s1, [DST, #offset]  /* both pixels written with a single store */
+.endm
+/* Head for the non-REVERSE functions: whole source pixels via pixld, one byte per pixel from MASK and DST; the 16-byte case also finishes the first pair. */
+.macro inout_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes >= 8
+        pixld   , 8, 0, SRC, unaligned_src
+  .if WITH_MASK
+        ldrb    STRIDE_S, [MASK], #4    /* STRIDE_S / STRIDE_M hold the two mask bytes */
+        ldrb    STRIDE_M, [MASK], #4
+  .endif
+        ldrb    WK2, [DST], #4          /* WK2 / WK3 hold the two destination bytes */
+        ldrb    WK3, [DST], #4
+  .if numbytes == 16
+        inout_2pixels WK0, WK1, STRIDE_S, STRIDE_M, WK2, WK3, SCRATCH, ORIG_W, Y, STRIDE_D, -11 /* DST is 3 + 2 * 4 bytes past the first pixel of the pair */
+        pixld   , 8, 0, SRC, unaligned_src
+   .if WITH_MASK
+        ldrb    STRIDE_S, [MASK], #4
+        ldrb    STRIDE_M, [MASK], #4
+   .endif
+        ldrb    WK2, [DST], #4
+        ldrb    WK3, [DST], #4
+  .endif
+ .else // numbytes == 4
+        pixld   , 4, 0, SRC, unaligned_src
+  .if WITH_MASK
+        ldrb    STRIDE_S, [MASK], #4
+  .endif
+        ldrb    WK2, [DST], #4
+ .endif
+.endm
+/* Tail for the non-REVERSE functions: combines and stores the pixel(s) the head left in WK0-WK3 and STRIDE_S / STRIDE_M. */
+.macro inout_process_tail cond, numbytes, firstreg
+ .if numbytes >= 8
+        inout_2pixels WK0, WK1, STRIDE_S, STRIDE_M, WK2, WK3, SCRATCH, ORIG_W, Y, STRIDE_D, -11
+ .else // numbytes == 4
+        inout_1pixel WK0, STRIDE_S, WK2, SCRATCH, Y, STRIDE_D, -7 /* DST is 3 + 4 bytes past the pixel */
+ .endif
+.endm
+/* One pixel, REVERSE: each byte of d is scaled by the single byte s, which is first scaled by m if WITH_MASK and inverted unless IN_NOT_OUT. */
+.macro inout_reverse_1pixel s, m, d, tmp, half, ff, offset
+ .if WITH_MASK
+        smlabb  s, s, m, half           /* s = s * m + bottom halfword of half */
+        uxtb16  tmp, d                  /* tmp = bytes 0 and 2 of d */
+        uxtb16  d, d, ror #8            /* d = bytes 1 and 3 of d */
+        uxtab   s, s, s, ror #8         /* s += (s >> 8) & 0xff */
+  .if IN_NOT_OUT
+        and     s, ff, s, lsr #8        /* s = (s >> 8) & ff */
+  .else
+        bic     s, ff, s, lsr #8        /* s = ff & ~(s >> 8) */
+  .endif
+ .else
+  .if !IN_NOT_OUT
+        bic     s, ff, s                /* s = ff & ~s */
+  .endif
+        uxtb16  tmp, d
+        uxtb16  d, d, ror #8
+ .endif
+        mla     tmp, tmp, s, half       /* multiply / add-high-byte sequence as in inout_1pixel, applied to d */
+        mla     d, d, s, half
+        uxtab16 tmp, tmp, tmp, ror #8
+        uxtab16 d, d, d, ror #8
+        mov     tmp, tmp, ror #8
+        sel     d, tmp, d               /* with GE = 0101: lanes 0 and 2 from tmp, lanes 1 and 3 from d */
+        str     d, [DST, #offset]
+.endm
+/* Two-pixel form of inout_reverse_1pixel for (s0, m0, d0) and (s1, m1, d1), interleaved, with one STRD at the end. */
+.macro inout_reverse_2pixels s0, s1, m0, m1, d0, d1, tmp0, tmp1, half, ff, offset
+ .if WITH_MASK
+        smlabb  s0, s0, m0, half
+        smlabb  s1, s1, m1, half
+        uxtb16  tmp0, d0
+        uxtb16  d0, d0, ror #8
+        uxtab   s0, s0, s0, ror #8
+        uxtab   s1, s1, s1, ror #8
+  .if IN_NOT_OUT
+        and     s0, ff, s0, lsr #8
+        and     s1, ff, s1, lsr #8
+  .else
+        bic     s0, ff, s0, lsr #8
+        bic     s1, ff, s1, lsr #8
+  .endif
+ .else
+  .if !IN_NOT_OUT
+        bic     s0, ff, s0
+        bic     s1, ff, s1
+  .endif
+        uxtb16  tmp0, d0
+        uxtb16  d0, d0, ror #8
+ .endif
+        uxtb16  tmp1, d1                /* the split of d1 is common to both branches above */
+        uxtb16  d1, d1, ror #8
+        mla     tmp0, tmp0, s0, half
+        mla     d0, d0, s0, half
+        mla     tmp1, tmp1, s1, half
+        mla     d1, d1, s1, half
+        uxtab16 tmp0, tmp0, tmp0, ror #8
+        uxtab16 d0, d0, d0, ror #8
+        uxtab16 tmp1, tmp1, tmp1, ror #8
+        uxtab16 d1, d1, d1, ror #8
+        mov     tmp0, tmp0, ror #8
+        mov     tmp1, tmp1, ror #8
+        sel     d0, tmp0, d0
+        sel     d1, tmp1, d1
+        strd    d0, d1, [DST, #offset]
+.endm
+/* Head for the REVERSE functions: one byte per pixel from SRC and MASK, whole pixels from DST; the 16-byte case may branch past all the arithmetic. */
+.macro inout_reverse_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 16
+        ldrb    WK0, [SRC], #4
+        ldrb    WK1, [SRC], #4
+        ldrb    SCRATCH, [SRC], #4
+        ldrb    ORIG_W, [SRC], #4
+  .if IN_NOT_OUT
+        and     WK2, WK0, WK1
+        and     WK2, WK2, SCRATCH
+        and     WK2, WK2, ORIG_W
+        bics    WK2, STRIDE_D, WK2      /* Z set only when all four source bytes are 0xff */
+  .else
+        orr     WK2, WK0, WK1
+        orr     WK2, WK2, SCRATCH
+        orrs    WK2, WK2, ORIG_W        /* Z set only when all four source bytes are 0 */
+  .endif
+  .if WITH_MASK
+        ldrb    STRIDE_S, [MASK], #4
+        ldrb    STRIDE_M, [MASK], #4
+        ldrb    SCRATCH, [MASK], #4     /* overwrites source bytes 3 and 4; they are fetched again after label 10 */
+        ldrb    ORIG_W, [MASK], #4
+  .endif
+        ldrd    WK2, WK3, [DST], #16    /* none of these loads is flag-setting, so Z from the test above is still valid */
+  .if WITH_MASK
+        bne     10f                     /* source test failed: process without looking at the mask */
+   .if IN_NOT_OUT
+        and     SCRATCH, SCRATCH, ORIG_W
+        and     SCRATCH, SCRATCH, STRIDE_S
+        and     SCRATCH, SCRATCH, STRIDE_M
+        bics    SCRATCH, STRIDE_D, SCRATCH /* Z set only when all four mask bytes are 0xff */
+   .else
+        orr     SCRATCH, SCRATCH, ORIG_W
+        orr     SCRATCH, SCRATCH, STRIDE_S
+        orrs    SCRATCH, SCRATCH, STRIDE_M /* Z set only when all four mask bytes are 0 */
+   .endif
+  .endif
+        beq     20f                     /* every multiplier would be 0xff, so skip to label 20 at the end of the tail macro without storing */
+10:
+        inout_reverse_2pixels WK0, WK1, STRIDE_S, STRIDE_M, WK2, WK3, SCRATCH, ORIG_W, Y, STRIDE_D, -16 /* DST was post-incremented by 16 above */
+  .if IN_NOT_OUT && !WITH_MASK
+        ldrd    WK2, WK3, [DST, #-8]    /* NOTE(review): only the position of this load differs between variants - presumably instruction scheduling; confirm */
+  .endif
+        ldrb    WK0, [SRC, #-8]         /* fetch the source bytes of pixels 3 and 4 for the tail */
+        ldrb    WK1, [SRC, #-4]
+  .if WITH_MASK
+        ldrb    STRIDE_S, [MASK, #-8]
+        ldrb    STRIDE_M, [MASK, #-4]
+  .endif
+  .if !(IN_NOT_OUT && !WITH_MASK)
+        ldrd    WK2, WK3, [DST, #-8]
+  .endif
+ .elseif numbytes == 8
+  .if IN_NOT_OUT && !WITH_MASK
+        ldrd    WK2, WK3, [DST], #8
+  .endif
+        ldrb    WK0, [SRC], #4
+        ldrb    WK1, [SRC], #4
+  .if WITH_MASK
+        ldrb    STRIDE_S, [MASK], #4
+        ldrb    STRIDE_M, [MASK], #4
+  .endif
+  .if !(IN_NOT_OUT && !WITH_MASK)
+        ldrd    WK2, WK3, [DST], #8
+  .endif
+ .else // numbytes == 4
+  .if IN_NOT_OUT && !WITH_MASK
+        ldr     WK2, [DST], #4
+  .endif
+        ldrb    WK0, [SRC], #4
+  .if WITH_MASK
+        ldrb    STRIDE_S, [MASK], #4
+  .endif
+  .if !(IN_NOT_OUT && !WITH_MASK)
+        ldr     WK2, [DST], #4
+  .endif
+ .endif
+.endm
+/* Tail for the REVERSE functions: combines and stores the pixel(s) left by the head, and provides the label its early exit branches to. */
+.macro inout_reverse_process_tail cond, numbytes, firstreg
+ .if numbytes >= 8
+        inout_reverse_2pixels WK0, WK1, STRIDE_S, STRIDE_M, WK2, WK3, SCRATCH, ORIG_W, Y, STRIDE_D, -8
+ .else // numbytes == 4
+        inout_reverse_1pixel WK0, STRIDE_S, WK2, SCRATCH, Y, STRIDE_D, -4
+ .endif
+20:     /* target of the "beq 20f" in inout_reverse_process_head */
+.endm
+/* Eight instantiations follow; REVERSE, IN_NOT_OUT and WITH_MASK select which .if branches of the macros above are assembled into each. */
+.set REVERSE, 0
+.set IN_NOT_OUT, 0
+.set WITH_MASK, 0
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_out_asm_armv6, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    inout_init, \
+    inout_newline, \
+    nop_macro, /* cleanup */ \
+    inout_process_head, \
+    inout_process_tail
+
+.set WITH_MASK, 1
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_out_mask_asm_armv6, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    inout_init, \
+    inout_newline, \
+    nop_macro, /* cleanup */ \
+    inout_process_head, \
+    inout_process_tail
+
+.set IN_NOT_OUT, 1
+.set WITH_MASK, 0
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_in_asm_armv6, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    inout_init, \
+    inout_newline, \
+    nop_macro, /* cleanup */ \
+    inout_process_head, \
+    inout_process_tail
+
+.set WITH_MASK, 1
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_in_mask_asm_armv6, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    inout_init, \
+    inout_newline, \
+    nop_macro, /* cleanup */ \
+    inout_process_head, \
+    inout_process_tail
+
+.set REVERSE, 1
+.set IN_NOT_OUT, 0
+.set WITH_MASK, 0
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_out_reverse_asm_armv6, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    inout_init, \
+    inout_newline, \
+    nop_macro, /* cleanup */ \
+    inout_reverse_process_head, \
+    inout_reverse_process_tail
+
+.set WITH_MASK, 1
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_out_reverse_mask_asm_armv6, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    inout_init, \
+    inout_newline, \
+    nop_macro, /* cleanup */ \
+    inout_reverse_process_head, \
+    inout_reverse_process_tail
+
+.set IN_NOT_OUT, 1
+.set WITH_MASK, 0
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_in_reverse_asm_armv6, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    inout_init, \
+    inout_newline, \
+    nop_macro, /* cleanup */ \
+    inout_reverse_process_head, \
+    inout_reverse_process_tail
+
+.set WITH_MASK, 1
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_in_reverse_mask_asm_armv6, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    inout_init, \
+    inout_newline, \
+    nop_macro, /* cleanup */ \
+    inout_reverse_process_head, \
+    inout_reverse_process_tail
+
+/******************************************************************************/
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 69c46c5..7f7d8c0 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -82,6 +82,10 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
                                         uint32_t, uint32_t)
 
 PIXMAN_ARM_BIND_COMBINE_U (armv6, over_reverse)
+PIXMAN_ARM_BIND_COMBINE_U (armv6, in)
+PIXMAN_ARM_BIND_COMBINE_U (armv6, in_reverse)
+PIXMAN_ARM_BIND_COMBINE_U (armv6, out)
+PIXMAN_ARM_BIND_COMBINE_U (armv6, out_reverse)
 PIXMAN_ARM_BIND_COMBINE_U (armv6, add)
 
 void
@@ -309,6 +313,10 @@ _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
     pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
 
     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = armv6_combine_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_IN] = armv6_combine_in_u;
+    imp->combine_32[PIXMAN_OP_IN_REVERSE] = armv6_combine_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_OUT] = armv6_combine_out_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = armv6_combine_out_reverse_u;
     imp->combine_32[PIXMAN_OP_ADD] = armv6_combine_add_u;
 
     imp->blt = arm_simd_blt;
-- 
1.7.5.4

_______________________________________________
Pixman mailing list
Pixman@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/pixman