Pushed to master.

Thanks,
Søren
Pekka Paalanen <ppaala...@gmail.com> writes: > From: Ben Avison <bavi...@riscosopen.org> > > Benchmark results, "before" is upstream/master > 5f661ee719be25c3aa0eb0d45e0db23a37e76468, and "after" contains this > patch on top. > > lowlevel-blt-bench, src_8888_0565, 100 iterations: > > Before After > Mean StdDev Mean StdDev Confidence Change > L1 25.9 0.20 115.6 0.70 100.00% +347.1% > L2 14.4 0.23 52.7 3.48 100.00% +265.0% > M 14.1 0.01 79.8 0.17 100.00% +465.9% > HT 10.2 0.03 32.9 0.31 100.00% +221.2% > VT 9.8 0.03 29.8 0.25 100.00% +203.4% > R 9.4 0.03 27.8 0.18 100.00% +194.7% > RT 4.6 0.04 10.9 0.29 100.00% +135.9% > > At most 19 outliers rejected per test per set. > > cairo-perf-trace with trimmed traces results were indifferent. > > A system-wide perf_3.10 profile on Raspbian shows significant > differences in the X server CPU usage. The following were measured from > a 130x62 char lxterminal running 'dmesg' every 0.5 seconds for roughly > 30 seconds. These profiles are libpixman.so symbols only. > > Before: > > Samples: 63K of event 'cpu-clock', Event count (approx.): 2941348112, DSO: > libpixman-1.so.0.33.1 > 37.77% Xorg [.] fast_fetch_r5g6b5 > 14.39% Xorg [.] pixman_composite_over_n_8_8888_asm_armv6 > 8.51% Xorg [.] fast_write_back_r5g6b5 > 7.38% Xorg [.] pixman_composite_src_8888_8888_asm_armv6 > 4.39% Xorg [.] pixman_composite_add_8_8_asm_armv6 > 3.69% Xorg [.] pixman_composite_src_n_8888_asm_armv6 > 2.53% Xorg [.] _pixman_image_validate > 2.35% Xorg [.] pixman_image_composite32 > > After: > > Samples: 31K of event 'cpu-clock', Event count (approx.): 3619782704, DSO: > libpixman-1.so.0.33.1 > 22.36% Xorg [.] pixman_composite_over_n_8_8888_asm_armv6 > 13.59% Xorg [.] pixman_composite_src_x888_0565_asm_armv6 > 12.75% Xorg [.] pixman_composite_src_8888_8888_asm_armv6 > 6.79% Xorg [.] pixman_composite_add_8_8_asm_armv6 > 5.95% Xorg [.] pixman_composite_src_n_8888_asm_armv6 > 4.12% Xorg [.] pixman_image_composite32 > 3.69% Xorg [.] 
_pixman_image_validate > 3.65% Xorg [.] _pixman_bits_image_setup_accessors > > Before, fast_fetch_r5g6b5 + fast_write_back_r5g6b5 took 46% of the > samples in libpixman, and probably incurred some memcpy() load, too. > After, pixman_composite_src_x888_0565_asm_armv6 takes 14%. Note, that > the sample counts are very different before/after, as less time is spent > in Pixman and running time is not exactly the same. > > Furthermore, in the above test, the CPU idle function was sampled 9% > before, and 15% after. > > v4, Pekka Paalanen <pekka.paala...@collabora.co.uk> : > Re-benchmarked on Raspberry Pi, commit message. > --- > pixman/pixman-arm-simd-asm.S | 77 > ++++++++++++++++++++++++++++++++++++++++++++ > pixman/pixman-arm-simd.c | 7 ++++ > 2 files changed, 84 insertions(+) > > diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S > index bc02ebb..7b0727b 100644 > --- a/pixman/pixman-arm-simd-asm.S > +++ b/pixman/pixman-arm-simd-asm.S > @@ -304,6 +304,83 @@ generate_composite_function \ > > > /******************************************************************************/ > > +.macro src_x888_0565_init > + /* Hold loop invariant in MASK */ > + ldr MASK, =0x001F001F > + line_saved_regs STRIDE_S, ORIG_W > +.endm > + > +.macro src_x888_0565_1pixel s, d > + and WK&d, MASK, WK&s, lsr #3 @ > 00000000000rrrrr00000000000bbbbb > + and STRIDE_S, WK&s, #0xFC00 @ > 0000000000000000gggggg0000000000 > + orr WK&d, WK&d, WK&d, lsr #5 @ > 00000000000-----rrrrr000000bbbbb > + orr WK&d, WK&d, STRIDE_S, lsr #5 @ > 00000000000-----rrrrrggggggbbbbb > + /* Top 16 bits are discarded during the following STRH */ > +.endm > + > +.macro src_x888_0565_2pixels slo, shi, d, tmp > + and SCRATCH, WK&shi, #0xFC00 @ > 0000000000000000GGGGGG0000000000 > + and WK&tmp, MASK, WK&shi, lsr #3 @ > 00000000000RRRRR00000000000BBBBB > + and WK&shi, MASK, WK&slo, lsr #3 @ > 00000000000rrrrr00000000000bbbbb > + orr WK&tmp, WK&tmp, WK&tmp, lsr #5 @ > 00000000000-----RRRRR000000BBBBB > + orr 
WK&tmp, WK&tmp, SCRATCH, lsr #5 @ > 00000000000-----RRRRRGGGGGGBBBBB > + and SCRATCH, WK&slo, #0xFC00 @ > 0000000000000000gggggg0000000000 > + orr WK&shi, WK&shi, WK&shi, lsr #5 @ > 00000000000-----rrrrr000000bbbbb > + orr WK&shi, WK&shi, SCRATCH, lsr #5 @ > 00000000000-----rrrrrggggggbbbbb > + pkhbt WK&d, WK&shi, WK&tmp, lsl #16 @ > RRRRRGGGGGGBBBBBrrrrrggggggbbbbb > +.endm > + > +.macro src_x888_0565_process_head cond, numbytes, firstreg, unaligned_src, > unaligned_mask, preload > + WK4 .req STRIDE_S > + WK5 .req STRIDE_M > + WK6 .req WK3 > + WK7 .req ORIG_W > + .if numbytes == 16 > + pixld , 16, 4, SRC, 0 > + src_x888_0565_2pixels 4, 5, 0, 0 > + pixld , 8, 4, SRC, 0 > + src_x888_0565_2pixels 6, 7, 1, 1 > + pixld , 8, 6, SRC, 0 > + .else > + pixld , numbytes*2, 4, SRC, 0 > + .endif > +.endm > + > +.macro src_x888_0565_process_tail cond, numbytes, firstreg > + .if numbytes == 16 > + src_x888_0565_2pixels 4, 5, 2, 2 > + src_x888_0565_2pixels 6, 7, 3, 4 > + .elseif numbytes == 8 > + src_x888_0565_2pixels 4, 5, 1, 1 > + src_x888_0565_2pixels 6, 7, 2, 2 > + .elseif numbytes == 4 > + src_x888_0565_2pixels 4, 5, 1, 1 > + .else > + src_x888_0565_1pixel 4, 1 > + .endif > + .if numbytes == 16 > + pixst , numbytes, 0, DST > + .else > + pixst , numbytes, 1, DST > + .endif > + .unreq WK4 > + .unreq WK5 > + .unreq WK6 > + .unreq WK7 > +.endm > + > +generate_composite_function \ > + pixman_composite_src_x888_0565_asm_armv6, 32, 0, 16, \ > + FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | > FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \ > + 3, /* prefetch distance */ \ > + src_x888_0565_init, \ > + nop_macro, /* newline */ \ > + nop_macro, /* cleanup */ \ > + src_x888_0565_process_head, \ > + src_x888_0565_process_tail > + > +/******************************************************************************/ > + > .macro add_8_8_8pixels cond, dst1, dst2 > uqadd8&cond WK&dst1, WK&dst1, MASK > uqadd8&cond WK&dst2, WK&dst2, STRIDE_M > diff --git 
a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c > index c17ce5a..fa1ab5c 100644 > --- a/pixman/pixman-arm-simd.c > +++ b/pixman/pixman-arm-simd.c > @@ -41,6 +41,8 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8_8, > uint8_t, 1, uint8_t, 1) > PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_8888, > uint16_t, 1, uint32_t, 1) > +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_x888_0565, > + uint32_t, 1, uint16_t, 1) > > PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8, > uint8_t, 1, uint8_t, 1) > @@ -224,6 +226,11 @@ static const pixman_fast_path_t arm_simd_fast_paths[] = > PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, a8b8g8r8, > armv6_composite_src_0565_8888), > PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, x8b8g8r8, > armv6_composite_src_0565_8888), > > + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, > armv6_composite_src_x888_0565), > + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, > armv6_composite_src_x888_0565), > + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, > armv6_composite_src_x888_0565), > + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, > armv6_composite_src_x888_0565), > + > PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, > armv6_composite_over_8888_8888), > PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, > armv6_composite_over_8888_8888), > PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, > armv6_composite_over_8888_8888), _______________________________________________ Pixman mailing list Pixman@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/pixman