This is used instead of the equivalent C fast path. lowlevel-blt-bench results, compared to no fast path at all:
              Before          After
          Mean   StdDev   Mean   StdDev   Confidence   Change
L1        12.4   0.1      117.5  2.3      100.0%       +851.2%
L2         9.5   0.1       46.9  2.4      100.0%       +393.8%
M          9.6   0.0       61.9  0.9      100.0%       +544.0%
HT         7.9   0.0       26.6  0.5      100.0%       +238.6%
VT         7.7   0.0       24.2  0.4      100.0%       +212.5%
R          7.4   0.0       22.4  0.4      100.0%       +204.5%
RT         4.1   0.0        8.7  0.2      100.0%       +109.4%
---
 pixman/pixman-arm-simd-asm.S |  111 ++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-simd.c     |    4 ++
 2 files changed, 115 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 08d6709..6c77fd3 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -1291,3 +1291,114 @@ generate_composite_function \
     over_n_0565_process_tail

 /******************************************************************************/
+
+.macro in_8888_8_init
+        SRC0    .req    Y
+        SRC1    .req    STRIDE_D
+        SRC2    .req    STRIDE_S
+        SRC3    .req    MASK
+        HALF    .req    STRIDE_M
+        TMP     .req    ORIG_W
+        line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
+        ldr     SCRATCH, =0x00800080
+        mov     HALF, #0x80
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, SCRATCH, SCRATCH
+        /* Offset the source pointer: we only need the alpha bytes */
+        add     SRC, SRC, #3
+.endm
+
+.macro in_8888_8_cleanup
+        .unreq  SRC0
+        .unreq  SRC1
+        .unreq  SRC2
+        .unreq  SRC3
+        .unreq  HALF
+        .unreq  TMP
+.endm
+
+.macro in_8888_8_4pixels_head dst
+        ldr     TMP, [DST], #4
+        ldrb    SRC0, [SRC], #12
+        ldrb    SRC3, [SRC], #-4
+        ldrb    SRC2, [SRC], #-4
+        uxtb16  WK&dst, TMP
+        uxtb16  TMP, TMP, ror #8
+        ldrb    SRC1, [SRC], #12
+        smlabb  SRC0, SRC0, WK&dst, HALF
+        smlabt  SRC3, SRC3, TMP, HALF
+        smlabt  SRC2, SRC2, WK&dst, HALF
+        smlabb  SRC1, SRC1, TMP, HALF
+        orr     WK&dst, SRC0, SRC2, lsl #16
+        /* There'd be a stall here if immediately followed by orr, so
+         * fill it with something like a preload if possible */
+.endm
+
+.macro in_8888_8_4pixels_tail dst
+        orr     TMP, SRC1, SRC3, lsl #16
+        uxtab16 WK&dst, WK&dst, WK&dst, ror #8
+        uxtab16 TMP, TMP, TMP, ror #8
+        mov     WK&dst, WK&dst, ror #8
+        sel     WK&dst, WK&dst, TMP
+.endm
+
+.macro in_8888_8_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 1
+        ldrb    WK3, [DST], #1
+        ldrb    SRC0, [SRC], #4
+ .elseif numbytes == 2
+        ldrb    WK3, [DST], #1
+        ldrb    SRC0, [SRC], #4
+        ldrb    TMP, [DST], #1
+        ldrb    SRC1, [SRC], #4
+ .else
+  .if numbytes >= 8
+   .if numbytes == 16
+        in_8888_8_4pixels_head 0
+        in_8888_8_4pixels_tail 0
+        in_8888_8_4pixels_head 1
+    .if preload
+        PF  bic,    SCRATCH, SRC, #31
+        PF  pld,    [SCRATCH, #32*prefetch_distance]
+    .endif
+        in_8888_8_4pixels_tail 1
+   .endif
+        in_8888_8_4pixels_head 2
+        in_8888_8_4pixels_tail 2
+  .endif
+        in_8888_8_4pixels_head 3
+ .endif
+.endm
+
+.macro in_8888_8_process_tail cond, numbytes, firstreg
+ .if numbytes == 1
+        smlabb  WK3, SRC0, WK3, HALF
+        add     WK3, WK3, WK3, lsr #8
+        mov     WK3, WK3, lsr #8
+        strb    WK3, [DST, #-1]
+ .elseif numbytes == 2
+        smlabb  WK3, SRC0, WK3, HALF
+        smlabb  TMP, SRC1, TMP, HALF
+        add     WK3, WK3, WK3, lsr #8
+        add     TMP, TMP, TMP, lsr #8
+        mov     WK3, WK3, lsr #8
+        mov     TMP, TMP, lsr #8
+        strb    WK3, [DST, #-2]
+        strb    TMP, [DST, #-1]
+ .else
+        in_8888_8_4pixels_tail 3
+        pixst   , numbytes, (4-numbytes/4), DST
+ .endif
+.endm
+
+generate_composite_function \
+    pixman_composite_in_8888_8_asm_armv6, 32, 0, 8, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
+    3, /* prefetch distance */ \
+    in_8888_8_init, \
+    nop_macro, /* newline */ \
+    in_8888_8_cleanup, \
+    in_8888_8_process_head, \
+    in_8888_8_process_tail
+
+/******************************************************************************/
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 31f960d..76770bc 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -48,6 +48,8 @@
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
                                    uint8_t, 1, uint8_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
                                    uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, in_8888_8,
+                                   uint32_t, 1, uint8_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, in_reverse_8888_8888,
                                    uint32_t, 1, uint32_t, 1)
@@ -260,6 +262,8 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (IN, a8r8g8b8, null, a8, armv6_composite_in_8888_8),
+    PIXMAN_STD_FAST_PATH (IN, a8b8g8r8, null, a8, armv6_composite_in_8888_8),
     PIXMAN_STD_FAST_PATH (IN_REVERSE, a8r8g8b8, null, a8r8g8b8, armv6_composite_in_reverse_8888_8888),
     PIXMAN_STD_FAST_PATH (IN_REVERSE, a8r8g8b8, null, x8r8g8b8, armv6_composite_in_reverse_8888_8888),
     PIXMAN_STD_FAST_PATH (IN_REVERSE, a8b8g8r8, null, a8b8g8r8, armv6_composite_in_reverse_8888_8888),
--
1.7.5.4

_______________________________________________
Pixman mailing list
Pixman@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/pixman