Hi,

I'm sorry, I made some mistakes in the previous patch: I wrongly
assumed that the q4~q7 registers were freely available to my functions.
The updated patch now passes the pixman scaling tests.

Performance benchmark results on ARM Cortex-A8 (scaling-bench):

  before: transl: op=3, src=20028888, mask=- dst=20028888, speed=5.58 MPix/s
  after:  transl: op=3, src=20028888, mask=- dst=20028888, speed=37.84 MPix/s

  Performance of nearest scaled OVER, for comparison:
          transl: op=3, src=20028888, mask=- dst=20028888, speed=60.73 MPix/s

  Performance of bilinear scaled SRC, for comparison:
          transl: op=1, src=20028888, mask=- dst=20028888, speed=65.47 MPix/s
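(That is roughly a 6.8x speedup over the C path, 37.84 / 5.58 ~= 6.8,
and about 62% of the speed of the nearest scaled OVER path.)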



On Tue, Mar 15, 2011 at 11:02 AM, Taekyun Kim <podai...@gmail.com> wrote:
>
> Hi, it's nice to see that you keep looking into improving bilinear
> scaling performance for pixman. I just wonder if you have totally
> given up on non-NEON bilinear optimizations by now? My understanding
> was that this was the area which you originally tried to work on.
>

I have to consider many platforms, with and without SIMD, so non-NEON
bilinear optimizations are still on my list. But their priority has
changed temporarily for some reasons.


> Also a bit tricky part is that I'm also still working on more pixman
> ARM NEON optimizations and I'm about to submit two additional bilinear
> performance optimizations patchsets, one of them unfortunately
> clashing with your patch. Not to mention that NEON optimized
> 'over_8888_8888' and 'over_8888_565' with bilinear scaled source are
> also part of my plan, even though they are not immediately available
> as of today.
>

I just needed some performance data immediately at that time, and I'm
waiting for your patches for the other bilinear operations to be
released :-)


> There are two pipeline stalls here on ARM Cortex-A8/A9. Most of NEON
> instructions have latency higher than 1 and you can't use the result
> of one instruction immediately in the next cycle without suffering
> from performance penalty. A simple reordering of instructions resolves
> the problem easily at least for this case:
>
> vuzp.8 d0, d1
> vuzp.8 d2, d3
> vuzp.8 d0, d1
> vuzp.8 d2, d3
>
> And unfortunately here we have really a lot of pipeline stalls which
> are a bit difficult to hide. This all does not make your solution bad,
> and it indeed should provide a really good speedup over C code. But it
> surely can be done a bit better.


I could not find a proper reordering to avoid the pipeline stalls in
the blending and interleaving stages. The destination registers of the
vmul, vadd and vqadd instructions only become available at the N6 or
N4 pipeline stage, so with only four pixels in flight there is not
enough independent work to hide the latency. I think combining eight
pixels at once will be more suitable for software pipelining (see the
sketch below). I also expect that proper prefetching and aligned
writes will significantly increase performance.
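
For example, something along these lines might hide the multiply
latency in the blending stage (just an untested sketch; the register
assignment for the second pixel group is hypothetical, and q4~q7 would
still need vpush/vpop as in the patch):

    /* Group 0: dst * (255 - srcA); vmull results mature around N6 */
    vmull.u8    q4, d2, d4
    vmull.u8    q5, d3, d4
    /* Group 1: independent multiplies issued to fill the latency */
    vmull.u8    q6, d20, d22
    vmull.u8    q7, d21, d22
    /* By now q4/q5 are ready, so the shifts do not stall */
    vrshr.u16   q2, q4, #8
    vrshr.u16   q3, q5, #8
    vrshr.u16   q8, q6, #8
    vrshr.u16   q9, q7, #8
    /* Rounding division by 255: (t + ((t + 128) >> 8)) >> 8 */
    vraddhn.u16 d2, q2, q4
    vraddhn.u16 d3, q3, q5
    vraddhn.u16 d20, q8, q6
    vraddhn.u16 d21, q9, q7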

I hope to see your patches soon.
And please leave some comments on my patch.

Thank you.

-- 
Best Regards,
Taekyun Kim
From 22a3fb829fb49c98d789f417560aee0c53c92877 Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq....@samsung.com>
Date: Wed, 16 Mar 2011 14:03:21 +0900
Subject: [PATCH] ARM: NEON optimizations for bilinear scaled 'over_8888_8888'

Use vpush/vpop to avoid corrupting the callee-saved registers q4~q7.
Minimize the number of registers used.
Tune the destination pixel prefetch distance to 16 pixels.
Minimize pipeline stalls.
---
 pixman/pixman-arm-neon-asm.S |  101 +++++++++++++++++++++++++++++++++++++-----
 pixman/pixman-arm-neon.c     |    4 ++
 2 files changed, 94 insertions(+), 11 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 71b30ac..c03576d 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2554,7 +2554,79 @@ fname:
 .endif
 .endm
 
-.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
+/*
+ * Combine functions are called just before writing results to memory.
+ * Assume that source pixels are located in d0, d1 registers
+ * in a8r8g8b8 format.
+ * Combine functions may use registers d2~d31 and must leave the
+ * result in d0, d1.
+ * TODO: 0565 format, optimization for 2 and 1 pixel case
+ * TODO: Fix hard-coded prefetch distance
+ */
+
+/* Dummy combine function for operator SRC */
+.macro bilinear_combine_src dst_fmt, numpix
+.endm
+
+/* Destination pixel load functions for bilinear_combine_XXXX */
+.macro bilinear_load_dst_8888 numpix
+.if numpix == 4
+	pld			[OUT, #64]
+	vld1.32		{d2, d3}, [OUT]
+.elseif numpix == 2
+	vld1.32		{d2}, [OUT]
+.elseif numpix == 1
+	vld1.32		{d2[0]}, [OUT]
+.else
+	.error "bilinear_load_dst_8888: unsupported numpix"
+.endif
+.endm
+
+.macro bilinear_load_dst_0565 numpix
+.if numpix == 4
+.elseif numpix == 2
+.elseif numpix == 1
+.else
+	.error "bilinear_load_dst_0565: unsupported numpix"
+.endif
+.endm
+
+/* Combine function for operator OVER */
+.macro bilinear_combine_over dst_fmt, numpix
+	vpush		{ q4, q5 }
+
+	bilinear_load_dst_&dst_fmt numpix
+	/* Deinterleave source & destination */
+	vuzp.8		d0, d1
+	vuzp.8		d2, d3
+	vuzp.8		d0, d1
+	vuzp.8		d2, d3
+
+	/* Invert source alpha */
+	vdup.32		d4, d1[1]
+	vmvn.8		d4, d4
+
+	/* result = dst * (255 - srcA) / 255, via vrshr + vraddhn rounding */
+	vmull.u8	q4, d2, d4
+	vmull.u8	q5, d3, d4
+
+	vrshr.u16	q2, q4, #8
+	vrshr.u16	q3, q5, #8
+
+	vraddhn.u16	d2, q2, q4
+	vraddhn.u16	d3, q3, q5
+
+	/* result += src (premultiplied) */
+	vqadd.u8	q0, q1, q0
+
+	/* Interleave back into (rgba, rgba, ...): two more vuzp.8 undo the deinterleave */
+	vuzp.8		d0, d1
+	vuzp.8		d0, d1
+
+	vpop		{ q4, q5 }
+.endm
+
+.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt, op
     bilinear_load_&src_fmt d0, d1, d2
     vmull.u8  q1, d0, d28
     vmlal.u8  q1, d1, d29
@@ -2568,10 +2640,11 @@ fname:
     /* 3 cycles bubble */
     vmovn.u16 d0, q0
     /* 1 cycle bubble */
+    bilinear_combine_&op dst_fmt, 1
     bilinear_store_&dst_fmt 1, q2, q3
 .endm
 
-.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
+.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt, op
     bilinear_load_and_vertical_interpolate_two_&src_fmt \
                 q1, q11, d0, d1, d20, d21, d22, d23
     vshr.u16  q15, q12, #8
@@ -2585,10 +2658,11 @@ fname:
     vshrn.u32 d30, q0, #16
     vshrn.u32 d31, q10, #16
     vmovn.u16 d0, q15
+    bilinear_combine_&op dst_fmt, 2
     bilinear_store_&dst_fmt 2, q2, q3
 .endm
 
-.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
+.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt, op
     bilinear_load_and_vertical_interpolate_four_&src_fmt \
                 q1, q11, d0, d1, d20, d21, d22, d23 \
                 q3, q9,  d4, d5, d16, d17, d18, d19
@@ -2616,6 +2690,7 @@ fname:
     vshrn.u32 d5, q8, #16
     vmovn.u16 d0, q0
     vmovn.u16 d1, q2
+    bilinear_combine_&op dst_fmt, 4
     bilinear_store_&dst_fmt 4, q2, q3
 .endm
 
@@ -2635,7 +2710,7 @@ fname:
  *                      pixels ahead
  */
 
-.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
+.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, op, \
                                        bpp_shift, prefetch_distance
 
 pixman_asm_function fname
@@ -2673,17 +2748,17 @@ pixman_asm_function fname
     blt       1f
     mov       PF_OFFS, PF_OFFS, asr #(16 - bpp_shift)
 0:
-    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt, op
     subs      WIDTH, WIDTH, #4
     bge       0b
 1:
     tst       WIDTH, #2
     beq       2f
-    bilinear_interpolate_two_pixels src_fmt, dst_fmt
+    bilinear_interpolate_two_pixels src_fmt, dst_fmt, op
 2:
     tst       WIDTH, #1
     beq       3f
-    bilinear_interpolate_last_pixel src_fmt, dst_fmt
+    bilinear_interpolate_last_pixel src_fmt, dst_fmt, op
 3:
     pop       {r4, r5, r6, r7, r8, r9}
     bx        lr
@@ -2706,13 +2781,17 @@ pixman_asm_function fname
 .endm
 
 generate_bilinear_scanline_func \
-    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 28
+    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, src, 2, 28
 
 generate_bilinear_scanline_func \
-    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2, 28
+    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, src, 2, 28
 
 generate_bilinear_scanline_func \
-    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1, 28
+    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, src, 1, 28
 
 generate_bilinear_scanline_func \
-    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, 1, 28
+    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, src, 1, 28
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, 8888, 8888, over, 2, 28
+
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 0a10ca1..ed9c839 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -136,6 +136,8 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC,
 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC,
                                          uint16_t, uint16_t)
 
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, OVER,
+                                         uint32_t, uint32_t)
 void
 pixman_composite_src_n_8_asm_neon (int32_t   w,
                                    int32_t   h,
@@ -362,6 +364,8 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
     SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_x888),
     SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_0565),
 
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+
     { PIXMAN_OP_NONE },
 };
 
-- 
1.7.0.4

_______________________________________________
Pixman mailing list
Pixman@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/pixman
