On Sat, Mar 26, 2016 at 2:09 AM, Michael Niedermayer <mich...@niedermayer.cc> wrote:
> On Fri, Mar 25, 2016 at 11:46:01PM +0100, Matthieu Bouron wrote: > > From: Matthieu Bouron <matthieu.bou...@stupeflix.com> > > > > --- > > libswscale/arm/yuv2rgb_neon.S | 89 > ++++++++++++------------------------------- > > 1 file changed, 24 insertions(+), 65 deletions(-) > > breaks build > > make distclean ; ../configure --cross-prefix=/usr/arm-linux-gnueabi/bin/ > --cc='ccache arm-linux-gnueabi-gcc-4.5' --extra-cflags='-mfpu=neon > -mfloat-abi=softfp' --cpu=cortex-a8 --arch=armv7 --target-os=linux > --enable-cross-compile && make -j12 > > CC libavutil/arm/float_dsp_init_arm.o > src/libswscale/arm/yuv2rgb_neon.S: Assembler messages: > src/libswscale/arm/yuv2rgb_neon.S:269: Error: thumb conditional > instruction should be in IT block -- `subeq r6,r6,r0' > src/libswscale/arm/yuv2rgb_neon.S:269: Error: thumb conditional > instruction should be in IT block -- `addne r6,r7' > [...] Patch updated with the relevant IT instructions added. It still builds on my rpi2 setup, but it has not been tested on the same setup as yours. Can you confirm it builds/works on your setup? If it works, I will send an updated version of the next patch (07/10) to resolve the conflicts. Matthieu
From 7b3affff405b2b483fb16f549b69ce6f21d8a946 Mon Sep 17 00:00:00 2001 From: Matthieu Bouron <matthieu.bou...@stupeflix.com> Date: Wed, 23 Mar 2016 11:26:13 +0000 Subject: [PATCH 06/10] swscale/arm/yuv2rgb: only process one line at a time for the yuv420p and nv{12,21} formats --- libswscale/arm/yuv2rgb_neon.S | 92 +++++++++++++------------------------------ 1 file changed, 27 insertions(+), 65 deletions(-) diff --git a/libswscale/arm/yuv2rgb_neon.S b/libswscale/arm/yuv2rgb_neon.S index ef7b0a6..6aeccae 100644 --- a/libswscale/arm/yuv2rgb_neon.S +++ b/libswscale/arm/yuv2rgb_neon.S @@ -105,16 +105,6 @@ compute_16px r2, d14, d15, \ofmt .endm -.macro process_2l_16px ofmt - compute_premult d28, d29, d30, d31 - - vld1.8 {q7}, [r4]! @ first line of luma - compute_16px r2, d14, d15, \ofmt - - vld1.8 {q7}, [r12]! @ second line of luma - compute_16px r11, d14, d15, \ofmt -.endm - .macro load_args_nvx push {r4-r12, lr} vpush {q4-q7} @@ -127,13 +117,9 @@ ldr r10,[sp, #128] @ r10 = y_coeff vdup.16 d0, r10 @ d0 = y_coeff vld1.16 {d1}, [r8] @ d1 = *table - add r11, r2, r3 @ r11 = dst + linesize (dst2) - add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2) - lsl r3, r3, #1 - lsl r5, r5, #1 - sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding) - sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY) - sub r7, r7, r0 @ r7 = linesizeC - width (paddingC) + sub r3, r3, r0, lsl #2 @ r3 = linesize - width * 4 (padding) + sub r5, r5, r0 @ r5 = linesizeY - width (paddingY) + sub r7, r7, r0 @ r7 = linesizeC - width (paddingC) .endm .macro load_args_yuv420p @@ -142,26 +128,6 @@ ldr r4, [sp, #104] @ r4 = srcY ldr r5, [sp, #108] @ r5 = linesizeY ldr r6, [sp, #112] @ r6 = srcU - ldr r8, [sp, #128] @ r8 = table - ldr r9, [sp, #132] @ r9 = y_offset - ldr r10,[sp, #136] @ r10 = y_coeff - vdup.16 d0, r10 @ d0 = y_coeff - vld1.16 {d1}, [r8] @ d1 = *table - add r11, r2, r3 @ r11 = dst + linesize (dst2) - add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2) - lsl r3, r3, #1 - lsl r5, r5, #1 - sub 
r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding) - sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY) - ldr r10,[sp, #120] @ r10 = srcV -.endm - -.macro load_args_yuv422p - push {r4-r12, lr} - vpush {q4-q7} - ldr r4, [sp, #104] @ r4 = srcY - ldr r5, [sp, #108] @ r5 = linesizeY - ldr r6, [sp, #112] @ r6 = srcU ldr r7, [sp, #116] @ r7 = linesizeU ldr r12,[sp, #124] @ r12 = linesizeV ldr r8, [sp, #128] @ r8 = table @@ -176,6 +142,10 @@ ldr r10,[sp, #120] @ r10 = srcV .endm +.macro load_args_yuv422p + load_args_yuv420p +.endm + .macro declare_func ifmt ofmt function ff_\ifmt\()_to_\ofmt\()_neon, export=1 @@ -205,35 +175,30 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1 vmov.i8 d10, #128 .ifc \ifmt,nv12 - pld [r12, #64*3] - vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line vsubl.u8 q14, d2, d10 @ q14 = U - 128 vsubl.u8 q15, d3, d10 @ q15 = V - 128 - process_2l_16px \ofmt + process_1l_16px \ofmt .endif .ifc \ifmt,nv21 - pld [r12, #64*3] - vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line vsubl.u8 q14, d3, d10 @ q14 = U - 128 vsubl.u8 q15, d2, d10 @ q15 = V - 128 - process_2l_16px \ofmt + process_1l_16px \ofmt .endif .ifc \ifmt,yuv420p pld [r10, #64*3] - pld [r12, #64*3] vld1.8 d2, [r6]! @ d2: chroma red line vld1.8 d3, [r10]! 
@ d3: chroma blue line vsubl.u8 q14, d2, d10 @ q14 = U - 128 vsubl.u8 q15, d3, d10 @ q15 = V - 128 - process_2l_16px \ofmt + process_1l_16px \ofmt .endif .ifc \ifmt,yuv422p @@ -254,35 +219,32 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1 add r4, r4, r5 @ srcY += paddingY .ifc \ifmt,nv12 - add r11, r11, r3 @ dst2 += padding - add r12, r12, r5 @ srcY2 += paddingY - - add r6, r6, r7 @ srcC += paddingC + tst r1, #1 + ite eq + subeq r6, r6, r0 @ if (height % 2 == 0) paddingU -= width + addne r6, r7 @ else paddingU += linesizeU - width - subs r1, r1, #2 @ height -= 2 + subs r1, r1, #1 @ height -= 1 .endif .ifc \ifmt,nv21 - add r11, r11, r3 @ dst2 += padding - add r12, r12, r5 @ srcY2 += paddingY + tst r1, #1 + ite eq + subeq r6, r6, r0 @ if (height % 2 == 0) paddingU -= width + addne r6, r7 @ else paddingU += linesizeU - width - add r6, r6, r7 @ srcC += paddingC - subs r1, r1, #2 @ height -= 2 + subs r1, r1, #1 @ height -= 1 .endif .ifc \ifmt,yuv420p - add r11, r11, r3 @ dst2 += padding - add r12, r12, r5 @ srcY2 += paddingY + tst r1, #1 + itete eq + subeq r6, r6, r0, lsr #1 @ if (height % 2 == 0) paddingU -= (width / 2) + addne r6, r7 @ else paddingU += linesizeU - (width / 2) + subeq r10, r10, r0, lsr #1 @ if (height % 2 == 0) paddingV -= (width / 2) + addne r10, r12 @ else paddingV += linesizeV - (width / 2) - ldr r7, [sp, #116] @ r7 = linesizeU - sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU) - add r6, r6, r7 @ srcU += paddingU - - ldr r7, [sp, #124] @ r7 = linesizeV - sub r7, r7, r0, lsr #1 @ r7 = linesizeV - width / 2 (paddingV) - add r10, r10, r7 @ srcV += paddingV - - subs r1, r1, #2 @ height -= 2 + subs r1, r1, #1 @ height -= 1 .endif .ifc \ifmt,yuv422p -- 2.7.4
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel