From: Nemanja Lukic nemanja.lu...@rt-rk.com
Performance numbers before/after on MIPS-74kc @ 1GHz:
lowlevel-blt-bench results
Reference (before):
src_0888__rev = L1: 51.88 L2: 42.00 M: 19.04 ( 88.50%) HT: 15.27 VT: 14.62 R: 14.13 RT: 7.12 ( 45Kops/s)
src_0888_0565_rev = L1: 31.96 L2: 30.90 M: 22.60 ( 75.03%) HT: 15.32 VT: 15.11 R: 14.49 RT: 6.64 ( 43Kops/s)
Optimized:
src_0888__rev = L1: 222.73 L2: 113.70 M: 20.97 ( 97.35%) HT: 18.31 VT: 17.14 R: 16.71 RT: 9.74 ( 54Kops/s)
src_0888_0565_rev = L1: 100.37 L2: 74.27 M: 29.43 ( 97.63%) HT: 22.92 VT: 21.59 R: 20.52 RT: 10.56 ( 56Kops/s)
---
pixman/pixman-mips-dspr2-asm.S | 389
pixman/pixman-mips-dspr2.c | 10 +
2 files changed, 399 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 299f739..3adbb2a 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -310,6 +310,395 @@ LEAF_MIPS_DSPR2(pixman_composite_src_x888__asm_mips)
END(pixman_composite_src_x888__asm_mips)
+#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) ||
defined(MIPSEL)
+LEAF_MIPS_DSPR2(pixman_composite_src_0888__rev_asm_mips)
+/*
+ * a0 - dst (a8r8g8b8)
+ * a1 - src (b8g8r8)
+ * a2 - w
+ */
+
+beqz a2, 6f
+ nop
+
+lui t8, 0xff00;
+srl t9, a2, 2 /* t9 = how many multiples of 4 src pixels */
+beqz t9, 4f /* branch if less than 4 src pixels */
+ nop
+
+lit0, 0x1
+lit1, 0x2
+lit2, 0x3
+andi t3, a1, 0x3
+beq t3, t0, 1f
+ nop
+beq t3, t1, 2f
+ nop
+beq t3, t2, 3f
+ nop
+
+0:
+beqz t9, 4f
+ addiut9, t9, -1
+lwt0, 0(a1)/* t0 = R2 | B1 | G1 | R1 */
+lwt1, 4(a1)/* t1 = G3 | R3 | B2 | G2 */
+lwt2, 8(a1)/* t2 = B4 | G4 | R4 | B3 */
+
+addiu a1, a1, 12
+addiu a2, a2, -4
+
+wsbh t0, t0 /* t0 = B1 | R2 | R1 | G1 */
+wsbh t1, t1 /* t1 = R3 | G3 | G2 | B2 */
+wsbh t2, t2 /* t2 = G4 | B4 | B3 | R4 */
+
+packrl.ph t3, t1, t0 /* t3 = G2 | B2 | B1 | R2 */
+packrl.ph t4, t0, t0 /* t4 = R1 | G1 | B1 | R2 */
+rotr t3, t3, 16 /* t3 = B1 | R2 | G2 | B2 */
+ort3, t3, t8 /* t3 = FF | R2 | G2 | B2 */
+srl t4, t4, 8/* t4 = 0 | R1 | G1 | B1 */
+ort4, t4, t8 /* t4 = FF | R1 | G1 | B1 */
+packrl.ph t5, t2, t1 /* t5 = B3 | R4 | R3 | G3 */
+rotr t5, t5, 24 /* t5 = R4 | R3 | G3 | B3 */
+ort5, t5, t8 /* t5 = FF | R3 | G3 | B3 */
+rotr t2, t2, 16 /* t2 = B3 | R4 | G4 | B4 */
+ort2, t2, t8 /* t5 = FF | R3 | G3 | B3 */
+
+swt4, 0(a0)
+swt3, 4(a0)
+swt5, 8(a0)
+swt2, 12(a0)
+b 0b
+ addiua0, a0, 16
+
+1:
+lbu t6, 0(a1)/* t6 = 0 | 0 | 0 | R1 */
+lhu t7, 1(a1)/* t7 = 0 | 0 | B1 | G1 */
+sll t6, t6, 16 /* t6 = 0 | R1 | 0 | 0 */
+wsbh t7, t7 /* t7 = 0 | 0 | G1 | B1 */
+ort7, t6, t7 /* t7 = 0 | R1 | G1 | B1 */
+11:
+beqz t9, 4f
+ addiut9, t9, -1
+lwt0, 3(a1)/* t0 = R3 | B2 | G2 | R2 */
+lwt1, 7(a1)/* t1 = G4 | R4 | B3 | G3 */
+lwt2, 11(a1) /* t2 = B5 | G5 | R5 | B4 */
+
+addiu a1, a1, 12
+addiu a2, a2, -4
+
+wsbh t0, t0 /* t0 = B2 | R3 | R2 | G2 */
+wsbh t1, t1 /* t1 = R4 | G4 | G3 | B3 */
+wsbh t2, t2 /* t2 = G5 | B5 | B4 | R5 */
+
+packrl.ph t3, t1, t0 /* t3 = G3 | B3 | B2 | R3 */
+packrl.ph t4, t2, t1 /* t4 = B4 | R5 | R4 | G4 */
+rotr t0, t0, 24 /* t0 = R3 | R2 | G2 | B2 */
+rotr t3, t3, 16 /* t3 = B2 | R3 | G3 | B3 */
+rotr t4, t4, 24 /* t4 = R5 | R4 | G4 | B4 */
+ort7, t7, t8 /* t7 = FF | R1 | G1 | B1 */
+ort0, t0, t8 /* t0 = FF | R2 | G2 | B2 */
+ort3, t3, t8 /* t1 = FF | R3 | G3 | B3 */