lowlevel-blt-bench results: Before After Mean StdDev Mean StdDev Confidence Change L1 21.3 0.1 32.5 0.2 100.0% +52.1% L2 12.1 0.2 19.5 0.5 100.0% +61.2% M 11.0 0.0 17.1 0.0 100.0% +54.6% HT 8.7 0.0 12.8 0.1 100.0% +46.9% VT 8.6 0.0 12.5 0.1 100.0% +46.0% R 8.6 0.0 12.0 0.1 100.0% +40.6% RT 5.1 0.1 6.6 0.1 100.0% +28.8%
Trimmed cairo-perf-trace results: Before After Mean StdDev Mean StdDev Confidence Change t-firefox-paintball 17.7 0.1 14.2 0.1 100.0% +24.5% --- pixman/pixman-arm-simd-asm.S | 104 ++++++++++++++++++++++++++++++++++++++++++ pixman/pixman-arm-simd.c | 8 +++ 2 files changed, 112 insertions(+), 0 deletions(-) diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S index ac084c4..20ad05a 100644 --- a/pixman/pixman-arm-simd-asm.S +++ b/pixman/pixman-arm-simd-asm.S @@ -875,6 +875,110 @@ generate_composite_function \ /******************************************************************************/ +.macro in_reverse_8888_8888_init + /* Hold loop invariant in MASK */ + ldr MASK, =0x00800080 + /* Set GE[3:0] to 0101 so SEL instructions do what we want */ + uadd8 SCRATCH, MASK, MASK + /* Offset the source pointer: we only need the alpha bytes */ + add SRC, SRC, #3 + line_saved_regs ORIG_W +.endm + +.macro in_reverse_8888_8888_head numbytes, reg1, reg2, reg3 + ldrb ORIG_W, [SRC], #4 + .if numbytes >= 8 + ldrb WK®1, [SRC], #4 + .if numbytes == 16 + ldrb WK®2, [SRC], #4 + ldrb WK®3, [SRC], #4 + .endif + .endif + add DST, DST, #numbytes +.endm + +.macro in_reverse_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + in_reverse_8888_8888_head numbytes, firstreg, %(firstreg+1), %(firstreg+2) +.endm + +.macro in_reverse_8888_8888_1pixel s, d, offset, is_only + .if is_only != 1 + movs s, ORIG_W + .if offset != 0 + ldrb ORIG_W, [SRC, #offset] + .endif + beq 01f + teq STRIDE_M, #0xFF + beq 02f + .endif + uxtb16 SCRATCH, d /* rb_dest */ + uxtb16 d, d, ror #8 /* ag_dest */ + mla SCRATCH, SCRATCH, s, MASK + mla d, d, s, MASK + uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8 + uxtab16 d, d, d, ror #8 + mov SCRATCH, SCRATCH, ror #8 + sel d, SCRATCH, d + b 02f + .if offset == 0 +48: /* Last mov d,#0 of the set - used as part of shortcut for + * source values all 0 */ + .endif +01: mov d, #0 +02: +.endm + +.macro in_reverse_8888_8888_tail numbytes, reg1, reg2, reg3, reg4 + .if numbytes == 4 + teq ORIG_W, ORIG_W, asr #32 + ldrne WK®1, [DST, #-4] + .elseif numbytes == 8 + teq ORIG_W, WK®1 + teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */ + ldmnedb DST, {WK®1-WK®2} + .else + teq ORIG_W, WK®1 + teqeq ORIG_W, WK®2 + teqeq ORIG_W, WK®3 + teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */ + ldmnedb DST, {WK®1-WK®4} + .endif + cmnne DST, #0 /* clear C if NE */ + bcs 49f /* no writes to dest if source all -1 */ + beq 48f /* set dest to all 0 if source all 0 */ + .if numbytes == 4 + in_reverse_8888_8888_1pixel ORIG_W, WK®1, 0, 1 + str WK®1, [DST, #-4] + .elseif numbytes == 8 + in_reverse_8888_8888_1pixel STRIDE_M, WK®1, -4, 0 + in_reverse_8888_8888_1pixel STRIDE_M, WK®2, 0, 0 + stmdb DST, {WK®1-WK®2} + .else + in_reverse_8888_8888_1pixel STRIDE_M, WK®1, -12, 0 + in_reverse_8888_8888_1pixel STRIDE_M, WK®2, -8, 0 + in_reverse_8888_8888_1pixel STRIDE_M, WK®3, -4, 0 + in_reverse_8888_8888_1pixel STRIDE_M, WK®4, 0, 0 + stmdb DST, {WK®1-WK®4} + .endif +49: +.endm + +.macro in_reverse_8888_8888_process_tail cond, numbytes, firstreg + in_reverse_8888_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3) +.endm + +generate_composite_function \ + pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32 \ + FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST \ + 2, /* prefetch distance */ \ + in_reverse_8888_8888_init, \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + in_reverse_8888_8888_process_head, \ + in_reverse_8888_8888_process_tail + +/******************************************************************************/ + #ifdef PROFILING .p2align 9 #endif diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c index b97c7ec..5a50098 100644 --- a/pixman/pixman-arm-simd.c +++ b/pixman/pixman-arm-simd.c @@ -47,6 +47,9 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8, PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888, uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, in_reverse_8888_8888, + uint32_t, 1, uint32_t, 1) + PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888, uint32_t, 1, uint32_t, 1) @@ -235,6 +238,11 @@ static const pixman_fast_path_t arm_simd_fast_paths[] = PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888), PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (IN_REVERSE, a8r8g8b8, null, a8r8g8b8, armv6_composite_in_reverse_8888_8888), + PIXMAN_STD_FAST_PATH (IN_REVERSE, a8r8g8b8, null, x8r8g8b8, armv6_composite_in_reverse_8888_8888), + PIXMAN_STD_FAST_PATH (IN_REVERSE, a8b8g8r8, null, a8b8g8r8, armv6_composite_in_reverse_8888_8888), + PIXMAN_STD_FAST_PATH (IN_REVERSE, a8b8g8r8, null, x8b8g8r8, armv6_composite_in_reverse_8888_8888), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, armv6_composite_over_n_8888_8888_ca), PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, armv6_composite_over_n_8888_8888_ca), PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, armv6_composite_over_n_8888_8888_ca), -- 1.7.5.4 _______________________________________________ Pixman mailing list Pixman@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/pixman