lowlevel-blt-bench results for two example operations, one without and one with a mask, neither of which has a dedicated fast path at the time of writing (figures are megapixels per second):
over_reverse_n_0565

        Before          After
        Mean   StdDev   Mean   StdDev   Confidence   Change
L1      9.4    0.0      21.7   0.1      100.0%       +131.3%
L2      9.2    0.1      20.6   0.3      100.0%       +122.7%
M       8.5    0.0      17.8   0.0      100.0%       +109.4%
HT      7.9    0.0      13.8   0.1      100.0%       +75.8%
VT      7.8    0.0      13.6   0.1      100.0%       +74.4%
R       7.6    0.0      13.0   0.1      100.0%       +71.7%
RT      5.0    0.0      6.8    0.1      100.0%       +35.6%

over_reverse_n_8_0565

        Before          After
        Mean   StdDev   Mean   StdDev   Confidence   Change
L1      5.6    0.0      14.7   0.1      100.0%       +162.1%
L2      5.7    0.0      14.6   0.2      100.0%       +157.3%
M       5.3    0.0      12.8   0.0      100.0%       +140.3%
HT      4.9    0.0      9.4    0.0      100.0%       +93.5%
VT      4.8    0.0      9.3    0.0      100.0%       +91.5%
R       4.7    0.0      8.7    0.0      100.0%       +86.9%
RT      2.9    0.0      4.0    0.1      100.0%       +38.3%
---
 pixman/pixman-arm-simd-asm.S |  255 ++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-simd.c     |    2 +
 2 files changed, 257 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index ce0edfc..0c13a73 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -545,6 +545,261 @@ generate_composite_function \
 
 /******************************************************************************/
 
+.macro over_reverse_8888_8888_init
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        msr     CPSR_s, #0x50000
+
+        /* Keep WK0,WK1 where they are so they remain valid for LDRD */
+        .unreq  WK2
+        .unreq  WK3
+        WK2     .req    STRIDE_S
+        WK3     .req    STRIDE_M
+        WK4     .req    r10
+        WK5     .req    r11
+
+        line_saved_regs STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W
+.endm
+
+.macro over_reverse_8888_8888_newline
+        ldr     MASK, =0x00800080
+        mov     STRIDE_D, #0xff
+.endm
+
+.macro over_reverse_8888_8888_cleanup
+        .unreq  WK2
+        .unreq  WK3
+        .unreq  WK4
+        .unreq  WK5
+        WK2     .req    r10
+        WK3     .req    r11
+.endm
+
+.macro over_reverse_8888_8888_1pixel s0, d0, tmp0, tmp1, half, ff, offset
+        uxtb16  tmp0, s0
+        sub     tmp1, ff, d0, lsr #24
+        uxtb16  s0, s0, ror #8
+        mla     tmp0, tmp0, tmp1, half
+        mla     s0, s0, tmp1, half
+        uxtab16 tmp0, tmp0, tmp0, ror #8
+        uxtab16 s0, s0, s0, ror #8
+        mov     tmp0, tmp0, ror #8
+        sel     s0, tmp0, s0
+        uqadd8  WK0, d0, s0
+        str     WK0, [DST, #offset]
+.endm
+
+.macro over_reverse_8888_8888_2pixels s0, s1, d0, d1, tmp0, tmp1, tmp2, tmp3, half, ff, offset
+        uxtb16  tmp0, s0
+        uxtb16  s0, s0, ror #8
+        sub     tmp2, ff, d0, lsr #24
+        sub     tmp3, ff, d1, lsr #24
+        uxtb16  tmp1, s1
+        uxtb16  s1, s1, ror #8
+        mla     tmp0, tmp0, tmp2, half
+        mla     s0, s0, tmp2, half
+        mla     tmp1, tmp1, tmp3, half
+        mla     s1, s1, tmp3, half
+        uxtab16 tmp0, tmp0, tmp0, ror #8
+        uxtab16 s0, s0, s0, ror #8
+        uxtab16 tmp1, tmp1, tmp1, ror #8
+        uxtab16 s1, s1, s1, ror #8
+        mov     tmp0, tmp0, ror #8
+        mov     tmp1, tmp1, ror #8
+        sel     s0, tmp0, s0
+        sel     s1, tmp1, s1
+        uqadd8  WK0, d0, s0
+        uqadd8  WK1, d1, s1
+        strd    WK0, WK1, [DST, #offset]
+.endm
+
+.macro over_reverse_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 16
+        pixld   , 16, 2, DST, 0
+        ldrd    WK0, WK1, [SRC], #16
+        and     SCRATCH, WK2, WK3
+        and     SCRATCH, SCRATCH, WK4
+        and     SCRATCH, SCRATCH, WK5
+        teq     STRIDE_D, SCRATCH, lsr #24
+ .elseif numbytes == 8
+        pixld   , 8, 2, DST, 0
+        pixld   , 8, 0, SRC, unaligned_src
+        and     SCRATCH, WK2, WK3
+        teq     STRIDE_D, SCRATCH, lsr #24
+ .else  // numbytes == 4
+        pixld   , 4, 2, DST, 0
+        pixld   , 4, 0, SRC, unaligned_src
+        teq     STRIDE_D, WK2, lsr #24
+ .endif
+.endm
+
+.macro over_reverse_8888_8888_process_tail cond, numbytes, firstreg
+        beq     10f     // all destination pixels are opaque
+ .if numbytes == 16
+        over_reverse_8888_8888_2pixels WK0, WK1, WK2, WK3, WK4, WK5, ORIG_W, SCRATCH, MASK, STRIDE_D, -16
+        ldmdb   SRC, {WK2, WK3}
+        ldmdb   DST, {WK0, WK1}
+        over_reverse_8888_8888_2pixels WK2, WK3, WK0, WK1, WK4, WK5, ORIG_W, SCRATCH, MASK, STRIDE_D, -8
+ .elseif numbytes == 8
+        over_reverse_8888_8888_2pixels WK0, WK1, WK2, WK3, WK4, WK5, ORIG_W, SCRATCH, MASK, STRIDE_D, -8
+ .else  // numbytes == 4
+        over_reverse_8888_8888_1pixel WK0, WK2, WK4, ORIG_W, MASK, STRIDE_D, -4
+ .endif
+10:
+.endm
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_over_reverse_asm_armv6, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    over_reverse_8888_8888_init, \
+    over_reverse_8888_8888_newline, \
+    over_reverse_8888_8888_cleanup, \
+    over_reverse_8888_8888_process_head, \
+    over_reverse_8888_8888_process_tail
+
+/******************************************************************************/
+
+.macro over_reverse_8888_8888_8888_init
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        msr     CPSR_s, #0x50000
+        /* Point at alpha byte in mask */
+        add     MASK, MASK, #3
+
+        /* Keep WK0,WK1 where they are so they remain valid for LDRD */
+        .unreq  WK2
+        .unreq  WK3
+        WK2     .req    STRIDE_S
+        WK3     .req    STRIDE_M
+        WK4     .req    r10
+        WK5     .req    r11
+
+        line_saved_regs Y, STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W
+.endm
+
+.macro over_reverse_8888_8888_8888_newline
+        ldr     Y, =0x00800080
+        mov     STRIDE_D, #0xff
+.endm
+
+.macro over_reverse_8888_8888_8888_cleanup
+        .unreq  WK2
+        .unreq  WK3
+        .unreq  WK4
+        .unreq  WK5
+        WK2     .req    r10
+        WK3     .req    r11
+.endm
+
+.macro over_reverse_8888_8888_8888_1pixel s, m, d, tmp, half, ff, offset
+        uxtb16  tmp, s
+        uxtb16  s, s, ror #8
+        mla     tmp, tmp, m, half
+        mla     s, s, m, half
+        sub     m, ff, d, lsr #24
+        uxtab16 tmp, tmp, tmp, ror #8
+        uxtab16 s, s, s, ror #8
+        uxtb16  tmp, tmp, ror #8
+        uxtb16  s, s, ror #8
+        mla     tmp, tmp, m, half
+        mla     s, s, m, half
+        uxtab16 tmp, tmp, tmp, ror #8
+        uxtab16 s, s, s, ror #8
+        mov     tmp, tmp, ror #8
+        sel     s, tmp, s
+        uqadd8  d, d, s
+        str     d, [DST, #offset]
+.endm
+
+.macro over_reverse_8888_8888_8888_2pixels s0, s1, m0, m1, d0, d1, tmp0, tmp1, half, ff, offset
+        uxtb16  tmp0, s0
+        uxtb16  s0, s0, ror #8
+        uxtb16  tmp1, s1
+        uxtb16  s1, s1, ror #8
+        mla     tmp0, tmp0, m0, half
+        mla     s0, s0, m0, half
+        mla     tmp1, tmp1, m1, half
+        mla     s1, s1, m1, half
+        sub     m0, ff, d0, lsr #24
+        sub     m1, ff, d1, lsr #24
+        uxtab16 tmp0, tmp0, tmp0, ror #8
+        uxtab16 s0, s0, s0, ror #8
+        uxtab16 tmp1, tmp1, tmp1, ror #8
+        uxtab16 s1, s1, s1, ror #8
+        uxtb16  tmp0, tmp0, ror #8
+        uxtb16  s0, s0, ror #8
+        uxtb16  tmp1, tmp1, ror #8
+        uxtb16  s1, s1, ror #8
+        mla     tmp0, tmp0, m0, half
+        mla     s0, s0, m0, half
+        mla     tmp1, tmp1, m1, half
+        mla     s1, s1, m1, half
+        uxtab16 tmp0, tmp0, tmp0, ror #8
+        uxtab16 s0, s0, s0, ror #8
+        uxtab16 tmp1, tmp1, tmp1, ror #8
+        uxtab16 s1, s1, s1, ror #8
+        mov     tmp0, tmp0, ror #8
+        mov     tmp1, tmp1, ror #8
+        sel     s0, tmp0, s0
+        sel     s1, tmp1, s1
+        uqadd8  WK0, d0, s0
+        uqadd8  WK1, d1, s1
+        strd    WK0, WK1, [DST, #offset]
+.endm
+
+.macro over_reverse_8888_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 16
+        pixld   , 16, 2, DST, 0
+        ldrd    WK0, WK1, [SRC], #16
+        ldrb    ORIG_W, [MASK], #16
+        and     SCRATCH, WK2, WK3
+        and     SCRATCH, SCRATCH, WK4
+        and     SCRATCH, SCRATCH, WK5
+        teq     STRIDE_D, SCRATCH, lsr #24
+ .elseif numbytes == 8
+        pixld   , 8, 2, DST, 0
+        pixld   , 8, 0, SRC, unaligned_src
+        ldrb    ORIG_W, [MASK], #8
+        and     SCRATCH, WK2, WK3
+        teq     STRIDE_D, SCRATCH, lsr #24
+ .else  // numbytes == 4
+        pixld   , 4, 2, DST, 0
+        pixld   , 4, 0, SRC, unaligned_src
+        ldrb    ORIG_W, [MASK], #4
+        teq     STRIDE_D, WK2, lsr #24
+ .endif
+.endm
+
+.macro over_reverse_8888_8888_8888_process_tail cond, numbytes, firstreg
+        beq     10f     // all destination pixels are opaque
+ .if numbytes == 16
+        ldrb    SCRATCH, [MASK, #-12]
+        over_reverse_8888_8888_8888_2pixels WK0, WK1, ORIG_W, SCRATCH, WK2, WK3, WK4, WK5, Y, STRIDE_D, -16
+        ldmdb   SRC, {WK2, WK3}
+        ldrb    ORIG_W, [MASK, #-8]
+        ldrb    SCRATCH, [MASK, #-4]
+        ldmdb   DST, {WK0, WK1}
+        over_reverse_8888_8888_8888_2pixels WK2, WK3, ORIG_W, SCRATCH, WK0, WK1, WK4, WK5, Y, STRIDE_D, -8
+ .elseif numbytes == 8
+        ldrb    SCRATCH, [MASK, #-4]
+        over_reverse_8888_8888_8888_2pixels WK0, WK1, ORIG_W, SCRATCH, WK2, WK3, WK4, WK5, Y, STRIDE_D, -8
+ .else  // numbytes == 4
+        over_reverse_8888_8888_8888_1pixel WK0, ORIG_W, WK2, WK4, Y, STRIDE_D, -4
+ .endif
+10:
+.endm
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_over_reverse_mask_asm_armv6, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    over_reverse_8888_8888_8888_init, \
+    over_reverse_8888_8888_8888_newline, \
+    over_reverse_8888_8888_8888_cleanup, \
+    over_reverse_8888_8888_8888_process_head, \
+    over_reverse_8888_8888_8888_process_tail
+
+/******************************************************************************/
+
 /* Multiply each byte of a word by a byte.
  * Useful when there aren't any obvious ways to fill the stalls with other instructions.
  * word  Register containing 4 bytes
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 834995a..69c46c5 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -81,6 +81,7 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
 PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
                                         uint32_t, uint32_t)
 
+PIXMAN_ARM_BIND_COMBINE_U (armv6, over_reverse)
 PIXMAN_ARM_BIND_COMBINE_U (armv6, add)
 
 void
@@ -307,6 +308,7 @@ _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
 {
     pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
 
+    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = armv6_combine_over_reverse_u;
     imp->combine_32[PIXMAN_OP_ADD] = armv6_combine_add_u;
 
     imp->blt = arm_simd_blt;
-- 
1.7.5.4
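
For anyone checking the arithmetic rather than the scheduling, here is a rough
C sketch of the per-pixel operation the scanline routines above implement. It
is not part of the patch, and the helper names (mul_un8x4, add_un8x4_sat,
over_reverse_pixel) are made up for illustration; pixman's generic code
expresses the same thing with the UN8x4_* macros in combine_over_reverse_u.
OVER_REVERSE composites the source underneath the destination: each source
pixel is scaled by the inverse of the destination alpha with correct rounding,
then saturating-added to the destination, which is what the SUB/MLA/UXTAB16/
SEL/UQADD8 sequences compute four bytes at a time.

#include <stdint.h>

/* Rounded multiply of each byte of an a8r8g8b8 pixel by an 8-bit factor:
 * t = x * a + 0x80; result = (t + (t >> 8)) >> 8, i.e. x * a / 255 with
 * rounding.  Each MLA + UXTAB16 pair in the assembly computes this for
 * two bytes at once. */
static uint32_t
mul_un8x4 (uint32_t pix, uint32_t a)
{
    uint32_t lo = (pix & 0x00ff00ff) * a + 0x00800080;
    uint32_t hi = ((pix >> 8) & 0x00ff00ff) * a + 0x00800080;

    lo += (lo >> 8) & 0x00ff00ff;
    hi += (hi >> 8) & 0x00ff00ff;

    return ((lo >> 8) & 0x00ff00ff) | (hi & 0xff00ff00);
}

/* Per-byte saturating add: the C analogue of UQADD8. */
static uint32_t
add_un8x4_sat (uint32_t x, uint32_t y)
{
    uint32_t r = 0;
    int i;

    for (i = 0; i < 32; i += 8)
    {
        uint32_t s = ((x >> i) & 0xff) + ((y >> i) & 0xff);
        r |= (s > 0xff ? 0xff : s) << i;
    }
    return r;
}

/* OVER_REVERSE: result = dest + src * (255 - dest.alpha).  When dest is
 * opaque the second term is zero, which is why the process_head macros
 * AND the destination alphas together and branch over the whole
 * computation ("beq 10f") for fully opaque blocks. */
static uint32_t
over_reverse_pixel (uint32_t src, uint32_t dest)
{
    uint32_t ia = 0xff - (dest >> 24);

    return add_un8x4_sat (dest, mul_un8x4 (src, ia));
}

/* Unified-mask variant: the source is first scaled by the mask's alpha
 * byte (the masked init macro advances MASK by 3 so LDRB fetches exactly
 * that byte), then the unmasked operation is applied. */
static uint32_t
over_reverse_mask_pixel (uint32_t src, uint32_t mask, uint32_t dest)
{
    return over_reverse_pixel (mul_un8x4 (src, mask >> 24), dest);
}

The masked routines need this second rounded multiply per pixel, which is why
their 1pixel/2pixels macros contain twice the MLA/UXTAB16 stages of the
unmasked ones.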