Hi, I'm sorry about that — I made some mistakes in the previous patch. I mistakenly assumed that the q4~q7 registers were available for my functions. Now it passes the pixman scaling tests.
Performance Benchmark Result on ARM Cortex-A8 (scaling-bench) before : transl: op=3, src=20028888, mask=- dst=20028888, speed=5.58 MPix/s after : transl: op=3, src=20028888, mask=- dst=20028888, speed=37.84 MPix/s performance of nearest scaling over for comparison transl: op=3, src=20028888, mask=- dst=20028888, speed=60.73 MPix/s performance of bilinear scaling src for comparison transl: op=1, src=20028888, mask=- dst=20028888, speed=65.47 MPix/s On Tue, Mar 15, 2011 at 11:02 AM, Taekyun Kim <podai...@gmail.com> wrote: > > Hi, it's nice to see that you keep looking into improving bilinear > scaling performance for pixman. I just wonder if you have totally > given up on non-NEON bilinear optimizations by now? My understanding > was that this was the area which you originally tried to work on. > I have to consider many platforms with or without SIMD. Non-NEON bilinear optimizations are still among my concerns, but their priority has temporarily changed for various reasons. > Also a bit tricky part is that I'm also still working on more pixman > ARM NEON optimizations and I'm about to submit two additional bilinear > performance optimizations patchsets, one of them unfortunately > clashing with your patch. Not to mention that NEON optimized > 'over_8888_8888' and 'over_8888_565' with bilinear scaled source are > also part of my plan, even though they are not immediately available > as of today. > I just needed some performance data immediately at that time, and I'm waiting for your patches for the other bilinear operations to be released :-) > There are two pipeline stalls here on ARM Cortex-A8/A9. Most of NEON > instructions have latency higher than 1 and you can't use the result > of one instruction immediately in the next cycle without suffering > from performance penalty. 
A simple reordering of instructions resolves > the problem easily at least for this case: > > vuzp.8 d0, d1 > vuzp.8 d2, d3 > vuzp.8 d0, d1 > vuzp.8 d2, d3 > > And unfortunately here we have really a lot of pipeline stalls which > are a bit difficult to hide. This all does not make your solution bad, > and it indeed should provide a really good speedup over C code. But it > surely can be done a bit better. I cannot find a proper reordering to avoid pipeline stalls in the blending and interleaving. The destination registers will be available at the N6 or N4 cycle for the vmul, vadd, and vqadd instructions. In the case of four pixels, it seems hard to avoid pipeline stalls. I think combining eight pixels at once will be more suitable for SW pipelining. And I also expect that proper prefetching and aligned writes will significantly increase the performance. I hope to see your patches soon. And please leave some comments on my patch. Thank you. -- Best Regards, Taekyun Kim
From 22a3fb829fb49c98d789f417560aee0c53c92877 Mon Sep 17 00:00:00 2001 From: Taekyun Kim <tkq....@samsung.com> Date: Wed, 16 Mar 2011 14:03:21 +0900 Subject: [PATCH] ARM: NEON optimizations for bilinear sclaed 'over_8888_8888' vpush/vpop to avoid corruptions in registers used by callee. Minimize the number of registers used. Tuned destination pixel prefetch distance to 16 pixels. Minimize pipeline stalls. --- pixman/pixman-arm-neon-asm.S | 101 +++++++++++++++++++++++++++++++++++++----- pixman/pixman-arm-neon.c | 4 ++ 2 files changed, 94 insertions(+), 11 deletions(-) diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S index 71b30ac..c03576d 100644 --- a/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman-arm-neon-asm.S @@ -2554,7 +2554,79 @@ fname: .endif .endm -.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt +/* + * Combine functions are called just before writing results to memory. + * Assume that source pixels are located in d0, d1 registers + * in a8r8g8b8 format. + * Combine functions may use registers d2~d31 and overwrite result + * on d0, d1 registers. 
+ * TODO: 0565 format, optimization for 2 and 1 pixel case + * TODO: Fix hard-coded prefetch distance + */ + +/* Dummy combine function for operator SRC */ +.macro bilinear_combine_src dst_fmt, numpix +.endm + +/* Destination pixel load functions for bilinear_combine_XXXX */ +.macro bilinear_load_dst_8888 numpix +.if numpix == 4 + pld [OUT, #64] + vld1.32 {d2, d3}, [OUT] +.elseif numpix == 2 + vld1.32 {d2}, [OUT] +.elseif numpix == 1 + vld1.32 {d2[0]}, [OUT] +.else + .error bilinear_load_dst_8888 numpix is unsupported +.endif +.endm + +.macro bilinear_load_dst_0565 numpix +.if numpix == 4 +.elseif numpix == 2 +.elseif numpix == 1 +.else + .error bilinear_load_dst_0565 numpix is unsupported +.endif +.endm + +/* Combine function for operator OVER */ +.macro bilinear_combine_over dst_fmt, numpix + vpush { q4, q5 } + + bilinear_load_dst_&dst_fmt numpix + /* Deinterleave source & destination */ + vuzp.8 d0, d1 + vuzp.8 d2, d3 + vuzp.8 d0, d1 + vuzp.8 d2, d3 + + /* invert source alpha */ + vdup.32 d4, d1[1] + vmvn.8 d4, d4 + + /* result = dst*(256 - srcA) */ + vmull.u8 q4, d2, d4 + vmull.u8 q5, d3, d4 + + vrshr.u16 q2, q4, #8 + vrshr.u16 q3, q5, #8 + + vraddhn.u16 d2, q2, q4 + vraddhn.u16 d3, q3, q5 + + /* result += src (premultiplied) */ + vqadd.u8 q0, q1, q0 + + /* Interleave (rrrr, gggg, bbbb, aaaa) into (rgba, rgba, rgba, rgba) */ + vuzp.8 d0, d1 + vuzp.8 d0, d1 + + vpop { q4, q5 } +.endm + +.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt, op bilinear_load_&src_fmt d0, d1, d2 vmull.u8 q1, d0, d28 vmlal.u8 q1, d1, d29 @@ -2568,10 +2640,11 @@ fname: /* 3 cycles bubble */ vmovn.u16 d0, q0 /* 1 cycle bubble */ + bilinear_combine_&op dst_fmt, 1 bilinear_store_&dst_fmt 1, q2, q3 .endm -.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt +.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt, op bilinear_load_and_vertical_interpolate_two_&src_fmt \ q1, q11, d0, d1, d20, d21, d22, d23 vshr.u16 q15, q12, #8 @@ -2585,10 +2658,11 @@ fname: vshrn.u32 d30, q0, #16 
vshrn.u32 d31, q10, #16 vmovn.u16 d0, q15 + bilinear_combine_&op dst_fmt, 2 bilinear_store_&dst_fmt 2, q2, q3 .endm -.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt +.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt, op bilinear_load_and_vertical_interpolate_four_&src_fmt \ q1, q11, d0, d1, d20, d21, d22, d23 \ q3, q9, d4, d5, d16, d17, d18, d19 @@ -2616,6 +2690,7 @@ fname: vshrn.u32 d5, q8, #16 vmovn.u16 d0, q0 vmovn.u16 d1, q2 + bilinear_combine_&op dst_fmt, 4 bilinear_store_&dst_fmt 4, q2, q3 .endm @@ -2635,7 +2710,7 @@ fname: * pixels ahead */ -.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \ +.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, op, \ bpp_shift, prefetch_distance pixman_asm_function fname @@ -2673,17 +2748,17 @@ pixman_asm_function fname blt 1f mov PF_OFFS, PF_OFFS, asr #(16 - bpp_shift) 0: - bilinear_interpolate_four_pixels src_fmt, dst_fmt + bilinear_interpolate_four_pixels src_fmt, dst_fmt, op subs WIDTH, WIDTH, #4 bge 0b 1: tst WIDTH, #2 beq 2f - bilinear_interpolate_two_pixels src_fmt, dst_fmt + bilinear_interpolate_two_pixels src_fmt, dst_fmt, op 2: tst WIDTH, #1 beq 3f - bilinear_interpolate_last_pixel src_fmt, dst_fmt + bilinear_interpolate_last_pixel src_fmt, dst_fmt, op 3: pop {r4, r5, r6, r7, r8, r9} bx lr @@ -2706,13 +2781,17 @@ pixman_asm_function fname .endm generate_bilinear_scanline_func \ - pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 28 + pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, src, 2, 28 generate_bilinear_scanline_func \ - pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2, 28 + pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, src, 2, 28 generate_bilinear_scanline_func \ - pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1, 28 + pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, src, 1, 28 generate_bilinear_scanline_func \ - 
pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, 1, 28 + pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, src, 1, 28 + +generate_bilinear_scanline_func \ + pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, 8888, 8888, over, 2, 28 + diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c index 0a10ca1..ed9c839 100644 --- a/pixman/pixman-arm-neon.c +++ b/pixman/pixman-arm-neon.c @@ -136,6 +136,8 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC, PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC, uint16_t, uint16_t) +PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST(SKIP_ZERO_SRC, neon, 8888_8888, OVER, + uint32_t, uint32_t) void pixman_composite_src_n_8_asm_neon (int32_t w, int32_t h, @@ -362,6 +364,8 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_x888), SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_0565), + SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888), + { PIXMAN_OP_NONE }, }; -- 1.7.0.4
_______________________________________________ Pixman mailing list Pixman@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/pixman