On Fri, Mar 25, 2016 at 11:46 PM, Matthieu Bouron <matthieu.bou...@gmail.com> wrote:
> From: Matthieu Bouron <matthieu.bou...@stupeflix.com>
>
> ---
>  libswscale/arm/yuv2rgb_neon.S | 154 +++++++++++++++++++-----------------------
>  1 file changed, 69 insertions(+), 85 deletions(-)

Patch updated (resolves a conflict with the updated version of patch 06/10).
From d06a5437f9042e0b350556e9642d52866284e7a8 Mon Sep 17 00:00:00 2001
From: Matthieu Bouron <matthieu.bou...@stupeflix.com>
Date: Wed, 23 Mar 2016 14:10:45 +0000
Subject: [PATCH 08/10] swscale/arm/yuv2rgb: re-organize the code like its
 aarch64 counter part

---
 libswscale/arm/yuv2rgb_neon.S | 154 +++++++++++++++++++-----------------------
 1 file changed, 69 insertions(+), 85 deletions(-)

diff --git a/libswscale/arm/yuv2rgb_neon.S b/libswscale/arm/yuv2rgb_neon.S
index 6279637..6a15778 100644
--- a/libswscale/arm/yuv2rgb_neon.S
+++ b/libswscale/arm/yuv2rgb_neon.S
@@ -21,90 +21,6 @@
 
 #include "libavutil/arm/asm.S"
 
-
-.macro compute_premult half_u1, half_u2, half_v1, half_v2
-    vmov                d2, \half_u1                   @ copy left q14 to left q1
-    vmov                d3, \half_u1                   @ copy left q14 to right q1
-    vmov                d4, \half_u2                   @ copy right q14 to left q2
-    vmov                d5, \half_u2                   @ copy right q14 to right q2
-
-    vmov                d6, \half_v1                   @ copy left q15 to left q3
-    vmov                d7, \half_v1                   @ copy left q15 to right q3
-    vmov                d8, \half_v2                   @ copy right q15 to left q4
-    vmov                d9, \half_v2                   @ copy right q15 to right q4
-
-    vzip.16             d2, d3                         @ U1U1U2U2U3U3U4U4
-    vzip.16             d4, d5                         @ U5U5U6U6U7U7U8U8
-
-    vzip.16             d6, d7                         @ V1V1V2V2V3V3V4V4
-    vzip.16             d8, d9                         @ V5V5V6V6V7V7V8V8
-
-    vmul.s16            q8, q3, d1[0]                  @ V * v2r             (left,  red)
-    vmul.s16            q9, q4, d1[0]                  @ V * v2r             (right, red)
-    vmul.s16            q10, q1, d1[1]                 @ U * u2g
-    vmul.s16            q11, q2, d1[1]                 @ U * u2g
-    vmla.s16            q10, q3, d1[2]                 @ U * u2g + V * v2g   (left,  green)
-    vmla.s16            q11, q4, d1[2]                 @ U * u2g + V * v2g   (right, green)
-    vmul.s16            q12, q1, d1[3]                 @ U * u2b             (left,  blue)
-    vmul.s16            q13, q2, d1[3]                 @ U * u2b             (right, blue)
-.endm
-
-.macro compute_color dst_comp1 dst_comp2 pre1 pre2
-    vadd.s16            q1, q14, \pre1
-    vadd.s16            q2, q15, \pre2
-    vqrshrun.s16        \dst_comp1, q1, #6
-    vqrshrun.s16        \dst_comp2, q2, #6
-.endm
-
-.macro compute_rgba r1 r2 g1 g2 b1 b2 a1 a2
-    compute_color       \r1, \r2, q8, q9
-    compute_color       \g1, \g2, q10, q11
-    compute_color       \b1, \b2, q12, q13
-    vmov.u8             \a1, #255
-    vmov.u8             \a2, #255
-.endm
-
-.macro compute_16px dst y0 y1 ofmt
-    vmovl.u8            q14, \y0                       @ 8px of y
-    vmovl.u8            q15, \y1                       @ 8px of y
-
-    vdup.16             q5, r9                         @ q5  = y_offset
-    vmov                d14, d0                        @ q7  = y_coeff
-    vmov                d15, d0                        @ q7  = y_coeff
-
-    vsub.s16            q14, q5
-    vsub.s16            q15, q5
-
-    vmul.s16            q14, q7                        @ q14 = (srcY - y_offset) * y_coeff (left)
-    vmul.s16            q15, q7                        @ q15 = (srcY - y_offset) * y_coeff (right)
-
-
-.ifc \ofmt,argb
-    compute_rgba        d7, d11, d8, d12, d9, d13, d6, d10
-.endif
-
-.ifc \ofmt,rgba
-    compute_rgba        d6, d10, d7, d11, d8, d12, d9, d13
-.endif
-
-.ifc \ofmt,abgr
-    compute_rgba        d9, d13, d8, d12, d7, d11, d6, d10
-.endif
-
-.ifc \ofmt,bgra
-    compute_rgba        d8, d12, d7, d11, d6, d10, d9, d13
-.endif
-    vst4.8              {q3, q4}, [\dst,:128]!
-    vst4.8              {q5, q6}, [\dst,:128]!
-
-.endm
-
-.macro process_1l_16px ofmt
-    compute_premult     d28, d29, d30, d31
-    vld1.8              {q7}, [r4]!
-    compute_16px        r2, d14, d15, \ofmt
-.endm
-
 .macro load_args_nv12
     push                {r4-r12, lr}
     vpush               {q4-q7}
@@ -200,6 +116,21 @@
     add                 r10,r10,r12                    @ srcV  += paddingV
 .endm
 
+.macro compute_color dst_comp1 dst_comp2 pre1 pre2
+    vadd.s16            q1, q14, \pre1
+    vadd.s16            q2, q15, \pre2
+    vqrshrun.s16        \dst_comp1, q1, #6
+    vqrshrun.s16        \dst_comp2, q2, #6
+.endm
+
+.macro compute_rgba r1 r2 g1 g2 b1 b2 a1 a2
+    compute_color       \r1, \r2, q8, q9
+    compute_color       \g1, \g2, q10, q11
+    compute_color       \b1, \b2, q12, q13
+    vmov.u8             \a1, #255
+    vmov.u8             \a2, #255
+.endm
+
 .macro declare_func ifmt ofmt
 function ff_\ifmt\()_to_\ofmt\()_neon, export=1
     load_args_\ifmt
@@ -210,7 +141,60 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
     pld [r4, #64*3]
     vmov.i8             d10, #128
     load_chroma_\ifmt
-    process_1l_16px \ofmt
+    vmov                d2, d28                        @ copy left q14 to left q1
+    vmov                d3, d28                        @ copy left q14 to right q1
+    vmov                d4, d29                        @ copy right q14 to left q2
+    vmov                d5, d29                        @ copy right q14 to right q2
+
+    vmov                d6, d30                        @ copy left q15 to left q3
+    vmov                d7, d30                        @ copy left q15 to right q3
+    vmov                d8, d31                        @ copy right q15 to left q4
+    vmov                d9, d31                        @ copy right q15 to right q4
+
+    vzip.16             d2, d3                         @ U1U1U2U2U3U3U4U4
+    vzip.16             d4, d5                         @ U5U5U6U6U7U7U8U8
+
+    vzip.16             d6, d7                         @ V1V1V2V2V3V3V4V4
+    vzip.16             d8, d9                         @ V5V5V6V6V7V7V8V8
+
+    vmul.s16            q8, q3, d1[0]                  @ V * v2r             (left,  red)
+    vmul.s16            q9, q4, d1[0]                  @ V * v2r             (right, red)
+    vmul.s16            q10, q1, d1[1]                 @ U * u2g
+    vmul.s16            q11, q2, d1[1]                 @ U * u2g
+    vmla.s16            q10, q3, d1[2]                 @ U * u2g + V * v2g   (left,  green)
+    vmla.s16            q11, q4, d1[2]                 @ U * u2g + V * v2g   (right, green)
+    vmul.s16            q12, q1, d1[3]                 @ U * u2b             (left,  blue)
+    vmul.s16            q13, q2, d1[3]                 @ U * u2b             (right, blue)
+
+    vld1.8              {q7}, [r4]!
+    vmovl.u8            q14, d14                       @ 8px of y
+    vmovl.u8            q15, d15                       @ 8px of y
+    vdup.16             q5, r9                         @ q5  = y_offset
+    vmov                d14, d0                        @ q7  = y_coeff
+    vmov                d15, d0                        @ q7  = y_coeff
+    vsub.s16            q14, q5
+    vsub.s16            q15, q5
+    vmul.s16            q14, q7                        @ q14 = (srcY - y_offset) * y_coeff (left)
+    vmul.s16            q15, q7                        @ q15 = (srcY - y_offset) * y_coeff (right)
+
+.ifc \ofmt,argb
+    compute_rgba        d7, d11, d8, d12, d9, d13, d6, d10
+.endif
+
+.ifc \ofmt,rgba
+    compute_rgba        d6, d10, d7, d11, d8, d12, d9, d13
+.endif
+
+.ifc \ofmt,abgr
+    compute_rgba        d9, d13, d8, d12, d7, d11, d6, d10
+.endif
+
+.ifc \ofmt,bgra
+    compute_rgba        d8, d12, d7, d11, d6, d10, d9, d13
+.endif
+
+    vst4.8              {q3, q4}, [r2,:128]!
+    vst4.8              {q5, q6}, [r2,:128]!
     subs                r8, r8, #16                    @ width -= 16
     bgt                 2b
     add                 r2, r2, r3                     @ dst += padding
-- 
2.7.4
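For anyone reading along, here is a rough scalar sketch (not part of the patch) of the per-pixel math the loop above vectorizes 16 pixels at a time, shown for the rgba output order. The coefficient names y_offset, y_coeff, v2r, u2g, v2g and u2b follow the register comments in the assembly; the helper names and the 128 chroma bias are my own reading of the code, not something the patch introduces.

```c
#include <stdint.h>

/* Illustrative scalar equivalent of one pixel of the NEON loop above
 * (rgba output order).  Coefficient names mirror the register comments;
 * the helpers themselves are hypothetical. */
static uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

static void yuv_to_rgba_px(uint8_t y, uint8_t u, uint8_t v,
                           int y_offset, int y_coeff,
                           int v2r, int u2g, int v2g, int u2b,
                           uint8_t out[4])
{
    int luma = (y - y_offset) * y_coeff;   /* q14/q15: (srcY - y_offset) * y_coeff */
    int cu   = u - 128;                    /* chroma assumed centred on 128 (vmov.i8 d10, #128) */
    int cv   = v - 128;

    /* vqrshrun.s16 #6: add the pre-multiplied chroma terms, round,
     * shift right by 6 and saturate to 0..255 */
    out[0] = clip_u8((luma + cv * v2r            + 32) >> 6);  /* R (q8/q9)   */
    out[1] = clip_u8((luma + cu * u2g + cv * v2g + 32) >> 6);  /* G (q10/q11) */
    out[2] = clip_u8((luma + cu * u2b            + 32) >> 6);  /* B (q12/q13) */
    out[3] = 255;                                              /* A           */
}
```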