This is tuned for the Cortex-A7 (Raspberry Pi 2). lowlevel-blt-bench results, compared to the ARMv6 fast path:
        Before          After
        Mean   StdDev   Mean   StdDev  Confidence  Change
L1      146.0  0.7      231.4  1.2     100.0%      +58.5%
L2      143.1  0.9      222.1  1.7     100.0%      +55.3%
M       110.9  0.0      129.0  0.5     100.0%      +16.3%
HT      57.3   0.6      73.0   0.3     100.0%      +27.4%
VT      46.6   0.5      61.6   0.4     100.0%      +32.3%
R       42.3   0.2      51.7   0.2     100.0%      +22.2%
RT      19.1   0.1      21.0   0.1     100.0%      +9.9%
---
 pixman/pixman-arm-neon-asm.S |   35 +++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-neon.c     |    4 ++++
 2 files changed, 39 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 7e949a3..2fecb5b 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2757,6 +2757,41 @@ generate_composite_function \
 
 /******************************************************************************/
 
+.macro pixman_composite_in_8888_8_process_pixblock_head
+    /* src is in d0-d3 (deinterleaved) */
+    /* destination pixel data is in d4 */
+    vmull.u8    q8, d3, d4
+.endm
+
+.macro pixman_composite_in_8888_8_process_pixblock_tail
+    vrshr.u16   q9, q8, #8
+    vraddhn.u16 d28, q8, q9
+    /* result is in d28 */
+.endm
+
+.macro pixman_composite_in_8888_8_process_pixblock_tail_head
+    vld4.8      {d0-d3}, [SRC]!
+    vrshr.u16   q9, q8, #8
+    vld1.8      {d4}, [DST_R :64]!
+    cache_preload 8, 8
+    vraddhn.u16 d28, q8, q9
+    vmull.u8    q8, d3, d4
+    vst1.8      {d28}, [DST_W :64]!
+.endm
+
+generate_composite_function \
+    pixman_composite_in_8888_8_asm_neon, 32, 0, 8, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    4, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_in_8888_8_process_pixblock_head, \
+    pixman_composite_in_8888_8_process_pixblock_tail, \
+    pixman_composite_in_8888_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
 generate_composite_function_nearest_scanline \
     pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 2f669cb..52ee9a4 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -66,6 +66,8 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, out_reverse_8_0565,
                                    uint8_t, 1, uint16_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, out_reverse_8_8888,
                                    uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, in_8888_8,
+                                   uint32_t, 1, uint8_t, 1)
 
 PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_0565,
                                  uint16_t, 1)
@@ -372,6 +374,8 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, neon_composite_add_8888_8888),
     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, neon_composite_add_8888_8888),
     PIXMAN_STD_FAST_PATH (IN, solid, null, a8, neon_composite_in_n_8),
+    PIXMAN_STD_FAST_PATH (IN, a8r8g8b8, null, a8, neon_composite_in_8888_8),
+    PIXMAN_STD_FAST_PATH (IN, a8b8g8r8, null, a8, neon_composite_in_8888_8),
     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, neon_composite_over_reverse_n_8888),
     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, neon_composite_over_reverse_n_8888),
     PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, r5g6b5, neon_composite_out_reverse_8_0565),
-- 
1.7.5.4

_______________________________________________
Pixman mailing list
Pixman@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/pixman