From: Nemanja Lukic <nemanja.lu...@rt-rk.com> Performance numbers before/after on MIPS-74kc @ 1GHz:
lowlevel-blt-bench results Referent (before): over_n_0565 = L1: 12.04 L2: 21.45 M: 18.50 ( 24.55%) HT: 6.93 VT: 6.45 R: 6.38 RT: 2.16 ( 22Kops/s) over_n_8888 = L1: 93.76 L2: 85.96 M: 24.41 ( 64.78%) HT: 8.93 VT: 8.08 R: 7.99 RT: 2.54 ( 25Kops/s) Optimized: over_n_0565 = L1: 26.19 L2: 24.93 M: 21.28 ( 28.25%) HT: 18.75 VT: 18.07 R: 17.85 RT: 11.34 ( 57Kops/s) over_n_8888 = L1: 55.31 L2: 49.07 M: 28.60 ( 75.93%) HT: 23.99 VT: 22.95 R: 22.34 RT: 12.85 ( 61Kops/s) --- pixman/pixman-mips-dspr2-asm.S | 106 ++++++++++++++++++++++++++++++++++++++++ pixman/pixman-mips-dspr2-asm.h | 31 ++++++++++++ pixman/pixman-mips-dspr2.c | 9 +++- pixman/pixman-mips-dspr2.h | 36 ++++++++++++++ 4 files changed, 181 insertions(+), 1 deletions(-) diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S index d2482e0..96d3f97 100644 --- a/pixman/pixman-mips-dspr2-asm.S +++ b/pixman/pixman-mips-dspr2-asm.S @@ -1342,6 +1342,112 @@ LEAF_MIPS_DSPR2(pixman_composite_over_8888_8888_asm_mips) END(pixman_composite_over_8888_8888_asm_mips) +LEAF_MIPS_DSPR2(pixman_composite_over_n_0565_asm_mips) +/* + * a0 - dst (r5g6b5) + * a1 - src (32bit constant) + * a2 - w (number of dst pixels to process) + */ + + SAVE_REGS_ON_STACK 0, s0, s1, s2, s3 + li t2, 0x00ff00ff /* t2 = maskLSR expected by OVER_2x8888_2x8888 */ + li t5, 0xf800f800 /* t5..t7 = mask constants handed to the 0565 CONVERT macros */ + li t6, 0x07e007e0 + li t7, 0x001F001F + beqz a2, 3f /* w == 0: nothing to composite */ + nop + addiu t1, a2, -1 + beqz t1, 2f /* w == 1: skip the two-pixel loop */ + nop +1: + /* a1 = source (32bit constant) */ + lhu t0, 0(a0) /* t0 = first destination pixel (r5g6b5) */ + /* a1 = source (32bit constant) */ + lhu t1, 2(a0) /* t1 = second destination pixel (r5g6b5) */ + + CONVERT_2x0565_TO_2x8888 t0, t1, t3, t4, t6, t7, t8, t9, s0, s1 + OVER_2x8888_2x8888 a1, a1, t3, t4, t0, t1, t2, t8, t9, s0, s1, s2, s3 /* results land in t0, t1 */ + CONVERT_2x8888_TO_2x0565 t0, t1, t3, t4, t5, t6, t7, t8, t9 + + sh t3, 0(a0) + sh t4, 2(a0) + addiu a2, a2, -2 + addiu t1, a2, -1 + bgtz t1, 1b /* loop while at least two pixels remain */ + addiu a0, a0, 4 /* (branch delay slot) advance dst by two pixels */ +2: + beqz a2, 3f /* no odd trailing pixel left */ + nop + /* a1 = source (32bit constant) */ + lhu t0, 0(a0) /* t0 = destination (r5g6b5) */ + + CONVERT_1x0565_TO_1x8888 t0, t1, t3, t4 + OVER_8888_8888 a1, 
t1, t0, t2, t3, t4, t5, t6 + CONVERT_1x8888_TO_1x0565 t0, t1, t3, t4 + + sh t1, 0(a0) +3: + RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3 + j ra + nop + +END(pixman_composite_over_n_0565_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_asm_mips) +/* + * a0 - dst (a8r8g8b8) + * a1 - src (32bit constant) + * a2 - w (number of dst pixels to process) + */ + + beqz a2, 4f /* w == 0: return without saving any registers */ + nop + + SAVE_REGS_ON_STACK 8, s0, s1, s2, s3, s4, s5 + li t4, 0x00ff00ff /* t4 = maskLSR expected by OVER_2x8888_2x8888 */ + srl t9, a2, 2 /* t9 = how many multiples of 4 dst pixels */ + beqz t9, 2f /* branch if fewer than 4 dst pixels */ + nop + +1: + beqz t9, 2f /* NOTE(review): on the first pass this repeats the check made just above */ + addiu t9, t9, -1 /* (branch delay slot) one 4-pixel group consumed */ + + lw t0, 0(a0) /* t0 = destination (a8r8g8b8) */ + lw t1, 4(a0) /* t1 = destination (a8r8g8b8) */ + lw s0, 8(a0) /* s0 = destination (a8r8g8b8) */ + lw s1, 12(a0) /* s1 = destination (a8r8g8b8) */ + + OVER_2x8888_2x8888 a1, a1, t0, t1, t2, t3, t4, t5, t6, t7, t8, s4, s5 + OVER_2x8888_2x8888 a1, a1, s0, s1, s2, s3, t4, t5, t6, t7, t8, s4, s5 + + sw t2, 0(a0) + sw t3, 4(a0) + sw s2, 8(a0) + sw s3, 12(a0) + addiu a2, a2, -4 + b 1b + addiu a0, a0, 16 /* (branch delay slot) advance dst by four pixels */ +2: + beqz a2, 3f /* no leftover pixels */ + nop +21: /* one pixel per iteration for the remaining w % 4 pixels */ + lw t0, 0(a0) /* t0 = destination (a8r8g8b8) */ + + OVER_8888_8888 a1, t0, t1, t4, t3, t2, t5, t6 + + sw t1, 0(a0) + addiu a2, a2, -1 + bnez a2, 21b + addiu a0, a0, 4 /* (branch delay slot) advance dst by one pixel */ +3: + RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5 +4: + j ra + nop + +END(pixman_composite_over_n_8888_asm_mips) + LEAF_MIPS_DSPR2(pixman_composite_add_8_8_8_asm_mips) /* * a0 - dst (a8) diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h index b330c0f..dcbac0b 100644 --- a/pixman/pixman-mips-dspr2-asm.h +++ b/pixman/pixman-mips-dspr2-asm.h @@ -587,6 +587,37 @@ LEAF_MIPS32R2(symbol) \ addu_s.qb \out_8888, \out_8888, \s_8888 .endm +/* + * OVER operation on two a8r8g8b8 source pixels (s1_8888 and s2_8888) and two + * a8r8g8b8 destination pixels (d1_8888 and d2_8888). It also requires maskLSR + * needed for the rounding process. 
maskLSR must have the following value: + * li maskLSR, 0x00ff00ff + */ +.macro OVER_2x8888_2x8888 s1_8888, \ + s2_8888, \ + d1_8888, \ + d2_8888, \ + out1_8888, \ + out2_8888, \ + maskLSR, \ + scratch1, scratch2, scratch3, \ + scratch4, scratch5, scratch6 + not \scratch1, \s1_8888 + srl \scratch1, \scratch1, 24 /* scratch1 = 255 - (top byte of s1_8888) */ + not \scratch2, \s2_8888 + srl \scratch2, \scratch2, 24 /* scratch2 = 255 - (top byte of s2_8888) */ + + MIPS_2xUN8x4_MUL_2xUN8 \d1_8888, \d2_8888, \ + \scratch1, \scratch2, \ + \out1_8888, \out2_8888, \ + \maskLSR, \ + \scratch3, \scratch4, \scratch5, \ + \scratch6, \d1_8888, \d2_8888 /* NOTE(review): d1/d2 are passed again in the trailing slots, presumably as scratch - confirm against MIPS_2xUN8x4_MUL_2xUN8 */ + + addu_s.qb \out1_8888, \out1_8888, \s1_8888 /* add the source on top (addu_s.qb: per-byte add with unsigned saturation) */ + addu_s.qb \out2_8888, \out2_8888, \s2_8888 +.endm + .macro MIPS_UN8x4_MUL_UN8_ADD_UN8x4 s_8888, \ m_8, \ d_8888, \ diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c index 161377b..11f1254 100644 --- a/pixman/pixman-mips-dspr2.c +++ b/pixman/pixman-mips-dspr2.c @@ -81,6 +81,11 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_0565_n_0565, PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, add_8888_n_8888, uint32_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, over_n_0565, + uint16_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, over_n_8888, + uint32_t, 1) + PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_8_8_8, uint8_t, 1, uint8_t, 1, uint8_t, 1) PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_8888_8_8888, uint32_t, 1, @@ -283,7 +288,9 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] = PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mips_composite_over_n_8_8888), PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mips_composite_over_n_8_0565), PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mips_composite_over_n_8_0565), - + PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mips_composite_over_n_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mips_composite_over_n_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mips_composite_over_n_8888), PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, 
a8r8g8b8, mips_composite_over_8888_n_8888), PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mips_composite_over_8888_n_8888), PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, r5g6b5, mips_composite_over_8888_n_0565), diff --git a/pixman/pixman-mips-dspr2.h b/pixman/pixman-mips-dspr2.h index 3766850..4ac9ff9 100644 --- a/pixman/pixman-mips-dspr2.h +++ b/pixman/pixman-mips-dspr2.h @@ -85,6 +85,42 @@ mips_composite_##name (pixman_implementation_t *imp, \ } \ } +/****************************************************************/ + +#define PIXMAN_MIPS_BIND_FAST_PATH_N_DST(flags, name, \ + dst_type, dst_cnt) \ +void \ +pixman_composite_##name##_asm_mips (dst_type *dst, \ + uint32_t src, \ + int32_t w); \ + \ +static void \ +mips_composite_##name (pixman_implementation_t *imp, \ + pixman_composite_info_t *info) \ +{ \ + PIXMAN_COMPOSITE_ARGS (info); \ + dst_type *dst_line, *dst; \ + int32_t dst_stride; \ + uint32_t src; /* solid source value, fetched once below */ \ + \ + src = _pixman_image_get_solid ( \ + imp, src_image, dest_image->bits.format); \ + \ + if ((flags & SKIP_ZERO_SRC) && src == 0) /* early out when the binding sets SKIP_ZERO_SRC */ \ + return; \ + \ + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type, \ + dst_stride, dst_line, dst_cnt); \ + \ + while (height--) /* one asm call per destination row, width pixels each */ \ + { \ + dst = dst_line; \ + dst_line += dst_stride; \ + \ + pixman_composite_##name##_asm_mips (dst, src, width); \ + } \ +} + /*******************************************************************/ #define PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST(flags, name, \ -- 1.7.3 _______________________________________________ Pixman mailing list Pixman@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/pixman