Based upon the implementation of the out_reverse combiner (which has been reordered to later in the patch series), this does a better job of scheduling than the previous version by processing two pixels at the same time, at the cost of spilling WK0 to the stack during the leading-pixel phase.
lowlevel-blt-bench results are as follows: Before After Mean StdDev Mean StdDev Confidence Change L1 37.5 0.3 40.5 0.3 100.0% +7.9% L2 28.8 0.6 29.2 0.9 98.6% +1.3% (insignificant) M 28.3 0.0 29.2 0.0 100.0% +3.3% HT 15.6 0.1 16.0 0.1 100.0% +2.3% VT 14.7 0.1 15.2 0.1 100.0% +2.9% R 15.8 0.1 16.0 0.1 100.0% +1.1% RT 7.8 0.1 7.9 0.1 100.0% +1.1% --- pixman/pixman-arm-simd-asm.S | 147 ++++++++++++++++++++++-------------------- 1 files changed, 78 insertions(+), 69 deletions(-) diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S index ca34b5e..37e9f33 100644 --- a/pixman/pixman-arm-simd-asm.S +++ b/pixman/pixman-arm-simd-asm.S @@ -428,27 +428,6 @@ generate_composite_function \ /******************************************************************************/ -.macro over_8888_8888_init - /* Hold loop invariant in MASK */ - ldr MASK, =0x00800080 - /* Set GE[3:0] to 0101 so SEL instructions do what we want */ - uadd8 SCRATCH, MASK, MASK - line_saved_regs STRIDE_D, STRIDE_S, ORIG_W -.endm - -.macro over_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload - WK4 .req STRIDE_D - WK5 .req STRIDE_S - WK6 .req STRIDE_M - WK7 .req ORIG_W - pixld , numbytes, %(4+firstreg), SRC, unaligned_src - pixld , numbytes, firstreg, DST, 0 - .unreq WK4 - .unreq WK5 - .unreq WK6 - .unreq WK7 -.endm - .macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3 /* Since these colours a premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */ teq WK&reg0, #0 @@ -461,65 +440,95 @@ generate_composite_function \ .endif .endm -.macro over_8888_8888_prepare next - mov WK&next, WK&next, lsr #24 +.macro over_8888_8888_init + /* Set GE[3:0] to 0101 so SEL instructions do what we want */ + msr CPSR_s, #0x50000 + line_saved_regs Y, STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W .endm -.macro over_8888_8888_1pixel src, dst, offset, next - /* src = destination component multiplier */ - rsb WK&src, 
WK&src, #255 - /* Split even/odd bytes of dst into SCRATCH/dst */ - uxtb16 SCRATCH, WK&dst - uxtb16 WK&dst, WK&dst, ror #8 - /* Multiply through, adding 0.5 to the upper byte of result for rounding */ - mla SCRATCH, SCRATCH, WK&src, MASK - mla WK&dst, WK&dst, WK&src, MASK - /* Where we would have had a stall between the result of the first MLA and the shifter input, - * reload the complete source pixel */ - ldr WK&src, [SRC, #offset] - /* Multiply by 257/256 to approximate 256/255 */ - uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8 - /* In this stall, start processing the next pixel */ - .if offset < -4 - mov WK&next, WK&next, lsr #24 +.macro over_8888_8888_newline + ldr Y, =0x00800080 + mov STRIDE_D, #0xff +.endm + +.macro over_8888_8888_1pixel s, m, d, tmp, half, ff, offset + sub m, ff, s, lsr #24 + uxtb16 tmp, d + uxtb16 d, d, ror #8 + mla tmp, tmp, m, half + mla d, d, m, half + uxtab16 tmp, tmp, tmp, ror #8 + uxtab16 d, d, d, ror #8 + mov tmp, tmp, ror #8 + sel d, tmp, d + uqadd8 d, d, s + str d, [DST, #offset] +.endm + +.macro over_8888_8888_2pixels s0, s1, m0, m1, d0, d1, tmp0, tmp1, half, ff, offset + sub m0, ff, s0, lsr #24 + sub m1, ff, s1, lsr #24 + uxtb16 tmp0, d0 + uxtb16 d0, d0, ror #8 + uxtb16 tmp1, d1 + uxtb16 d1, d1, ror #8 + mla tmp0, tmp0, m0, half + mla d0, d0, m0, half + mla tmp1, tmp1, m1, half + mla d1, d1, m1, half + uxtab16 tmp0, tmp0, tmp0, ror #8 + uxtab16 d0, d0, d0, ror #8 + uxtab16 tmp1, tmp1, tmp1, ror #8 + uxtab16 d1, d1, d1, ror #8 + mov tmp0, tmp0, ror #8 + mov tmp1, tmp1, ror #8 + sel d0, tmp0, d0 + sel d1, tmp1, d1 + uqadd8 d0, d0, s0 + uqadd8 d1, d1, s1 + strd d0, d1, [DST, #offset] +.endm + +.macro over_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + .if numbytes == 16 + ldm SRC!, {WK0, WK1} + ldm SRC!, {STRIDE_S, STRIDE_M} + ldrd WK2, WK3, [DST], #16 + orr SCRATCH, WK0, WK1 + orr SCRATCH, SCRATCH, STRIDE_S + orrs SCRATCH, SCRATCH, STRIDE_M + beq 20f + over_8888_8888_2pixels WK0, WK1, 
STRIDE_S, STRIDE_M, WK2, WK3, SCRATCH, ORIG_W, Y, STRIDE_D, -16 + ldrd WK0, WK1, [SRC, #-8] + ldrd WK2, WK3, [DST, #-8] + .elseif numbytes == 8 + ldrd WK0, WK1, [SRC], #8 + ldrd WK2, WK3, [DST], #8 + orrs SCRATCH, WK0, WK1 + beq 20f + .else // numbytes == 4 + ldr WK0, [SRC], #4 + ldr WK2, [DST], #4 + teq WK0, #0 + beq 20f .endif - uxtab16 WK&dst, WK&dst, WK&dst, ror #8 - /* Recombine even/odd bytes of multiplied destination */ - mov SCRATCH, SCRATCH, ror #8 - sel WK&dst, SCRATCH, WK&dst - /* Saturated add of source to multiplied destination */ - uqadd8 WK&dst, WK&dst, WK&src .endm .macro over_8888_8888_process_tail cond, numbytes, firstreg - WK4 .req STRIDE_D - WK5 .req STRIDE_S - WK6 .req STRIDE_M - WK7 .req ORIG_W - over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg) - beq 10f - over_8888_8888_prepare %(4+firstreg) - .set PROCESS_REG, firstreg - .set PROCESS_OFF, -numbytes - .rept numbytes / 4 - over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG) - .set PROCESS_REG, PROCESS_REG+1 - .set PROCESS_OFF, PROCESS_OFF+4 - .endr - pixst , numbytes, firstreg, DST -10: - .unreq WK4 - .unreq WK5 - .unreq WK6 - .unreq WK7 + .if numbytes >= 8 + over_8888_8888_2pixels WK0, WK1, STRIDE_S, STRIDE_M, WK2, WK3, SCRATCH, ORIG_W, Y, STRIDE_D, -8 + .else // numbytes == 4 + over_8888_8888_1pixel WK0, STRIDE_S, WK2, SCRATCH, Y, STRIDE_D, -4 + .endif +20: .endm generate_composite_function \ - pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32 \ - FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \ + pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \ 2, /* prefetch distance */ \ over_8888_8888_init, \ - nop_macro, /* newline */ \ + over_8888_8888_newline, \ nop_macro, /* cleanup */ \ 
over_8888_8888_process_head, \ over_8888_8888_process_tail -- 1.7.5.4 _______________________________________________ Pixman mailing list Pixman@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/pixman