yuv2rgb: save a few instructions by processing the luma line interleaved

Benoit Fouet Thu, 31 Mar 2016 02:18:12 -0700

Hi,

On 28/03/2016 21:19, Matthieu Bouron wrote:

---
  libswscale/arm/yuv2rgb_neon.S | 88 +++++++++++++++++--------------------------
  1 file changed, 34 insertions(+), 54 deletions(-)


diff --git a/libswscale/arm/yuv2rgb_neon.S b/libswscale/arm/yuv2rgb_neon.S
index 124d7d3..6b911c8 100644
--- a/libswscale/arm/yuv2rgb_neon.S
+++ b/libswscale/arm/yuv2rgb_neon.S

[...]

@@ -94,25 +67,29 @@
  .ifc \ofmt,bgra
      compute_rgba        d8, d7, d6, d9, d12, d11, d10, d13
  .endif
+
+    vzip.8              d6, d10
+    vzip.8              d7, d11
+    vzip.8              d8, d12
+    vzip.8              d9, d13


Adding a comment here, above the vzip.8 instructions, to explain the resulting interleaving would be nice.

      vst4.8              {q3, q4}, [\dst,:128]!
      vst4.8              {q5, q6}, [\dst,:128]!
-
  .endm

.macro process_1l ofmt

-    compute_premult     d28, d29, d30, d31
-    vld1.8              {q7}, [r4]!
-    compute             r2, d14, d15, \ofmt
+    compute_premult
+    vld2.8              {d14, d15}, [r4]!
+    compute             r2, \ofmt
  .endm

.macro process_2l ofmt

-    compute_premult     d28, d29, d30, d31
+    compute_premult

-    vld1.8              {q7}, [r4]!                                    @ first line of luma

-    compute             r2, d14, d15, \ofmt
+    vld2.8              {d14, d15}, [r4]!                              @ q7 = Y (interleaved)
+    compute             r2, \ofmt

-    vld1.8              {q7}, [r12]!                                   @ second line of luma

-    compute             r11, d14, d15, \ofmt
+    vld2.8              {d14, d15}, [r12]!                             @ q7 = Y (interleaved)
+    compute             r11, \ofmt
  .endm


What about adding a level of macro here? Something like:
.macro process_1l_internal ofmt src_addr res
    compute_premult
    vld2.8            {d14, d15}, [\src_addr]!
    compute        \res, \ofmt
.endm

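(again, the naming could be changed, according to your own taste :-) )

Note that, as written, the helper invokes compute_premult once per luma line, so process_2l would run it twice instead of once as in the patch; if that matters, keep compute_premult in process_1l/process_2l and leave only the load and compute in the helper.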

This way, we would get:
.macro process_1l ofmt
    process_1l_internal \ofmt, r4, r2
.endm

.macro process_2l ofmt
    process_1l_internal \ofmt, r4,  r2
    process_1l_internal \ofmt, r12, r11
.endm

--
Ben

_______________________________________________
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH v2 8/9] swscale/arm/yuv2rgb: save a few instructions by processing the luma line interleaved

Reply via email to