From: Matthieu Bouron <matthieu.bou...@stupeflix.com>

---
 libswscale/arm/yuv2rgb_neon.S | 89 ++++++++++++-------------------------------
 1 file changed, 24 insertions(+), 65 deletions(-)

diff --git a/libswscale/arm/yuv2rgb_neon.S b/libswscale/arm/yuv2rgb_neon.S
index ef7b0a6..8abb986 100644
--- a/libswscale/arm/yuv2rgb_neon.S
+++ b/libswscale/arm/yuv2rgb_neon.S
@@ -105,16 +105,6 @@
     compute_16px        r2, d14, d15, \ofmt
 .endm
 
-.macro process_2l_16px ofmt
-    compute_premult     d28, d29, d30, d31
-
-    vld1.8              {q7}, [r4]!                                    @ first 
line of luma
-    compute_16px        r2, d14, d15, \ofmt
-
-    vld1.8              {q7}, [r12]!                                   @ 
second line of luma
-    compute_16px        r11, d14, d15, \ofmt
-.endm
-
 .macro load_args_nvx
     push                {r4-r12, lr}
     vpush               {q4-q7}
@@ -127,13 +117,9 @@
     ldr                 r10,[sp, #128]                                 @ r10 = 
y_coeff
     vdup.16             d0, r10                                        @ d0  = 
y_coeff
     vld1.16             {d1}, [r8]                                     @ d1  = 
*table
-    add                 r11, r2, r3                                    @ r11 = 
dst + linesize (dst2)
-    add                 r12, r4, r5                                    @ r12 = 
srcY + linesizeY (srcY2)
-    lsl                 r3, r3, #1
-    lsl                 r5, r5, #1
-    sub                 r3, r3, r0, lsl #2                             @ r3 = 
linesize  * 2 - width * 4 (padding)
-    sub                 r5, r5, r0                                     @ r5 = 
linesizeY * 2 - width     (paddingY)
-    sub                 r7, r7, r0                                     @ r7 = 
linesizeC     - width     (paddingC)
+    sub                 r3, r3, r0, lsl #2                             @ r3 = linesize  - width * 4 (padding)
+    sub                 r5, r5, r0                                     @ r5 = linesizeY - width     (paddingY)
+    sub                 r7, r7, r0                                     @ r7 = linesizeC - width     (paddingC)
 .endm
 
 .macro load_args_yuv420p
@@ -142,26 +128,6 @@
     ldr                 r4, [sp, #104]                                 @ r4  = 
srcY
     ldr                 r5, [sp, #108]                                 @ r5  = 
linesizeY
     ldr                 r6, [sp, #112]                                 @ r6  = 
srcU
-    ldr                 r8, [sp, #128]                                 @ r8  = 
table
-    ldr                 r9, [sp, #132]                                 @ r9  = 
y_offset
-    ldr                 r10,[sp, #136]                                 @ r10 = 
y_coeff
-    vdup.16             d0, r10                                        @ d0  = 
y_coeff
-    vld1.16             {d1}, [r8]                                     @ d1  = 
*table
-    add                 r11, r2, r3                                    @ r11 = 
dst + linesize (dst2)
-    add                 r12, r4, r5                                    @ r12 = 
srcY + linesizeY (srcY2)
-    lsl                 r3, r3, #1
-    lsl                 r5, r5, #1
-    sub                 r3, r3, r0, lsl #2                             @ r3 = 
linesize  * 2 - width * 4 (padding)
-    sub                 r5, r5, r0                                     @ r5 = 
linesizeY * 2 - width     (paddingY)
-    ldr                 r10,[sp, #120]                                 @ r10 = 
srcV
-.endm
-
-.macro load_args_yuv422p
-    push                {r4-r12, lr}
-    vpush               {q4-q7}
-    ldr                 r4, [sp, #104]                                 @ r4  = 
srcY
-    ldr                 r5, [sp, #108]                                 @ r5  = 
linesizeY
-    ldr                 r6, [sp, #112]                                 @ r6  = 
srcU
     ldr                 r7, [sp, #116]                                 @ r7  = 
linesizeU
     ldr                 r12,[sp, #124]                                 @ r12 = 
linesizeV
     ldr                 r8, [sp, #128]                                 @ r8  = 
table
@@ -176,6 +142,10 @@
     ldr                 r10,[sp, #120]                                 @ r10 = 
srcV
 .endm
 
+.macro load_args_yuv422p
+    load_args_yuv420p
+.endm
+
 .macro declare_func ifmt ofmt
 function ff_\ifmt\()_to_\ofmt\()_neon, export=1
 
@@ -205,35 +175,30 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
     vmov.i8             d10, #128
 
 .ifc \ifmt,nv12
-    pld [r12, #64*3]
-
     vld2.8              {d2, d3}, [r6]!                                @ q1: 
interleaved chroma line
     vsubl.u8            q14, d2, d10                                   @ q14 = 
U - 128
     vsubl.u8            q15, d3, d10                                   @ q15 = 
V - 128
 
-    process_2l_16px \ofmt
+    process_1l_16px \ofmt
 .endif
 
 .ifc \ifmt,nv21
-    pld [r12, #64*3]
-
     vld2.8              {d2, d3}, [r6]!                                @ q1: 
interleaved chroma line
     vsubl.u8            q14, d3, d10                                   @ q14 = 
U - 128
     vsubl.u8            q15, d2, d10                                   @ q15 = 
V - 128
 
-    process_2l_16px \ofmt
+    process_1l_16px \ofmt
 .endif
 
 .ifc \ifmt,yuv420p
     pld [r10, #64*3]
-    pld [r12, #64*3]
 
     vld1.8              d2, [r6]!                                      @ d2: 
chroma red line
     vld1.8              d3, [r10]!                                     @ d3: 
chroma blue line
     vsubl.u8            q14, d2, d10                                   @ q14 = 
U - 128
     vsubl.u8            q15, d3, d10                                   @ q15 = 
V - 128
 
-    process_2l_16px \ofmt
+    process_1l_16px \ofmt
 .endif
 
 .ifc \ifmt,yuv422p
@@ -254,35 +219,29 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
     add                 r4, r4, r5                                     @ srcY  
+= paddingY
 
 .ifc \ifmt,nv12
-    add                 r11, r11, r3                                   @ dst2  
+= padding
-    add                 r12, r12, r5                                   @ srcY2 
+= paddingY
-
-    add                 r6, r6, r7                                     @ srcC  
+= paddingC
+    tst                 r1, #1
+    subeq               r6, r6, r0                                     @ if (height % 2 == 0) srcC -= width
+    addne               r6, r7                                         @ else                 srcC += linesizeC - width (paddingC)
 
-    subs                r1, r1, #2                                     @ height -= 2
+    subs                r1, r1, #1                                     @ height -= 1
 .endif
 
 .ifc \ifmt,nv21
-    add                 r11, r11, r3                                   @ dst2  
+= padding
-    add                 r12, r12, r5                                   @ srcY2 
+= paddingY
+    tst                 r1, #1
+    subeq               r6, r6, r0                                     @ if (height % 2 == 0) srcC -= width
+    addne               r6, r7                                         @ else                 srcC += linesizeC - width (paddingC)
 
-    add                 r6, r6, r7                                     @ srcC  
+= paddingC
-    subs                r1, r1, #2                                     @ height -= 2
+    subs                r1, r1, #1                                     @ height -= 1
 .endif
 
 .ifc \ifmt,yuv420p
-    add                 r11, r11, r3                                   @ dst2  
+= padding
-    add                 r12, r12, r5                                   @ srcY2 
+= paddingY
+    tst                 r1, #1
+    subeq               r6, r6, r0, lsr #1                             @ if (height % 2 == 0) srcU -= (width / 2)
+    addne               r6, r7                                         @ else                 srcU += linesizeU - (width / 2)
+    subeq               r10, r10, r0, lsr #1                           @ if (height % 2 == 0) srcV -= (width / 2)
+    addne               r10, r12                                       @ else                 srcV += linesizeV - (width / 2)
 
-    ldr                 r7, [sp, #116]                                 @ r7    
 = linesizeU
-    sub                 r7, r7, r0, lsr #1                             @ r7    
 = linesizeU - width / 2 (paddingU)
-    add                 r6, r6, r7                                     @ srcU  
+= paddingU
-
-    ldr                 r7, [sp, #124]                                 @ r7    
 = linesizeV
-    sub                 r7, r7, r0, lsr #1                             @ r7    
 = linesizeV - width / 2 (paddingV)
-    add                 r10, r10, r7                                   @ srcV  
+= paddingV
-
-    subs                r1, r1, #2                                     @ height -= 2
+    subs                r1, r1, #1                                     @ height -= 1
 .endif
 
 .ifc \ifmt,yuv422p
-- 
2.7.4

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Reply via email to