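One is that, in the narrow case, the pixel count wasn't being shifted by the correct amount for 8bpp or 16bpp images: it was always shifted left by 2 (which is only right for 32bpp) instead of by the shift appropriate to the image's bits per pixel. The fix is illustrated by src_8_8: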

        Before            After
        Mean    StdDev    Mean    StdDev   Change   Confidence
L1      592.6   40.4      615.1   75.6      3.8%    56%     (insignificant)
L2      235.0    5.5      230.5   19.8     -1.9%    48%     (insignificant)
M       229.2    2.2      229.0    1.5     -0.1%    20%     (insignificant)
HT       60.0    0.4       62.4    0.6      4.0%    100.0%
VT       52.9    0.5       53.4    0.5      0.9%    94.5%   (insignificant)
R        45.2    0.4       47.7    0.7      5.6%    100.0%
RT       12.0    0.4       12.1    1.6      0.8%    14%     (insignificant)

The second bug meant that only the source pointer was being used for preloads for mid-width rectangles (typically between 32 and 160 bytes wide), regardless of which buffer the preload was actually meant for. A routine that illustrates this is over_8888_8888, where the destination buffer is supposed to be preloaded:

        Before            After
        Mean    StdDev    Mean    StdDev   Change   Confidence
L1       37.6    0.4       37.9    0.3      1.0%    99.5%
L2       30.8    0.5       30.8    0.5      0.1%    22%     (insignificant)
M        25.8    0.0       25.8    0.0      0.0%    21%     (insignificant)
HT       14.4    0.1       15.5    0.1      8.0%    100.0%
VT       13.8    0.1       14.6    0.1      6.2%    100.0%
R        14.3    0.1       15.7    0.1     10.3%    100.0%
RT        6.7    0.4        7.6    0.4     12.5%    100.0%

This bug also explains why medium-width rectangle prefetch was a regression for over_n_8_8888.
Now it can be re-enabled, and results are: Before After Mean StdDev Mean StdDev Change Confidence L1 22.8 0.2 22.8 0.2 -0.2% 41% (insignificant) L2 21.8 0.1 21.8 0.1 0.1% 37% (insignificant) M 22.2 0.0 22.2 0.1 -0.1% 56% (insignificant) HT 12.3 0.1 14.1 0.1 14.4% 100.0% VT 11.7 0.1 13.3 0.6 13.8% 100.0% R 10.9 0.1 12.8 0.5 17.1% 100.0% RT 5.9 0.1 6.5 0.1 11.3% 100.0% --- pixman/pixman-arm-simd-asm.S | 2 +- pixman/pixman-arm-simd-asm.h | 22 ++++++---------------- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S index 8700da9..f043826 100644 --- a/pixman/pixman-arm-simd-asm.S +++ b/pixman/pixman-arm-simd-asm.S @@ -576,7 +576,7 @@ generate_composite_function \ generate_composite_function \ pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \ - FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_ONLY_PRELOAD_WIDE \ + FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \ 2, /* prefetch distance */ \ over_n_8_8888_init, \ over_n_8_8888_newline, \ diff --git a/pixman/pixman-arm-simd-asm.h b/pixman/pixman-arm-simd-asm.h index c1db3fc..ee70131 100644 --- a/pixman/pixman-arm-simd-asm.h +++ b/pixman/pixman-arm-simd-asm.h @@ -232,7 +232,7 @@ /* In these cases, each line for each channel is in either 1 or 2 cache lines */ PF bic, WK0, base, #31 PF pld, [WK0] - PF add, WK1, base, X, LSL #2 + PF add, WK1, base, X, LSL #bpp_shift PF sub, WK1, WK1, #1 PF bic, WK1, WK1, #31 PF cmp, WK1, WK0 @@ -240,9 +240,9 @@ PF pld, [WK1] 90: .else - PF bic, WK0, SRC, #31 + PF bic, WK0, base, #31 PF pld, [WK0] - PF add, WK1, SRC, X, lsl #bpp_shift + PF add, WK1, base, X, lsl #bpp_shift PF sub, WK1, WK1, #1 PF bic, WK1, WK1, #31 PF cmp, WK1, WK0 @@ -399,18 +399,8 @@ preload_trailing mask_bpp, mask_bpp_shift, MASK preload_trailing dst_r_bpp, dst_bpp_shift, DST add X, X, 
#(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp -113: - process_head , 16, 0, unaligned_src, unaligned_mask, 0 - process_tail , 16, 0 - .if !((flags) & FLAG_PROCESS_DOES_STORE) - pixst , 16, 0, DST - .endif - subs X, X, #128/dst_w_bpp - bhs 113b - /* Trailing pixels */ - tst X, #128/dst_w_bpp - 1 - beq exit_label - trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask + /* The remainder of the line is handled identically to the medium case */ + medium_case_inner_loop_and_trailing_pixels process_head, process_tail, exit_label, unaligned_src, unaligned_mask .endm .macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, exit_label, unaligned_src, unaligned_mask @@ -723,7 +713,7 @@ fname: sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */ tst DST, #15 beq 164f - rsb WK0, DST, #0 /* bits 0-4 = number of leading bytes until destination aligned */ + rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */ leading_15bytes process_head, process_tail -- 1.7.5.4 _______________________________________________ Pixman mailing list Pixman@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/pixman